Index: head/sys/contrib/ipfilter/netinet/ip_auth.c =================================================================== --- head/sys/contrib/ipfilter/netinet/ip_auth.c (revision 105193) +++ head/sys/contrib/ipfilter/netinet/ip_auth.c (revision 105194) @@ -1,630 +1,631 @@ /* * Copyright (C) 1998-2001 by Darren Reed & Guido van Rooij. * * See the IPFILTER.LICENCE file for details on licencing. */ #ifdef __sgi # include #endif #include #include #include #include #include #if !defined(_KERNEL) && !defined(KERNEL) # include # include # include #endif #if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) # include # include #else # include #endif #ifndef linux # include #endif #include #if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) # include #endif #if !defined(__SVR4) && !defined(__svr4__) # ifndef linux # include # endif #else # include # include # ifdef _KERNEL # include # endif # include # include #endif #if (_BSDI_VERSION >= 199802) || (__FreeBSD_version >= 400000) # include #endif #if defined(__NetBSD__) || defined(__OpenBSD__) || defined(bsdi) # include #endif #include #ifdef sun # include #endif #include #include #include #include #ifndef KERNEL # define KERNEL # define NOT_KERNEL #endif #ifndef linux # include #endif #ifdef NOT_KERNEL # undef KERNEL #endif #ifdef __sgi # ifdef IFF_DRVRLOCK /* IRIX6 */ # include # endif #endif #include #if defined(__sgi) && !defined(IFF_DRVRLOCK) /* IRIX < 6 */ extern struct ifqueue ipintrq; /* ip packet input queue */ #else # ifndef linux # if __FreeBSD_version >= 300000 # include # endif # include # include # endif #endif #include #include #include "netinet/ip_compat.h" #include #include "netinet/ip_fil.h" #include "netinet/ip_auth.h" #if !SOLARIS && !defined(linux) # include # ifdef __FreeBSD__ # include # endif #endif #if (__FreeBSD_version >= 300000) # include # if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) # include # include # endif #endif #if !defined(lint) /* static const char rcsid[] = "@(#)$Id: ip_auth.c,v 2.11.2.12 2001/07/18 14:57:08 darrenr Exp $"; */ static const char rcsid[] = "@(#)$FreeBSD$"; #endif #if (SOLARIS || defined(__sgi)) && defined(_KERNEL) extern KRWLOCK_T ipf_auth, ipf_mutex; extern kmutex_t ipf_authmx; # if SOLARIS extern kcondvar_t ipfauthwait; # endif #endif #ifdef linux static struct wait_queue *ipfauthwait = NULL; #endif int fr_authsize = FR_NUMAUTH; int fr_authused = 0; int fr_defaultauthage = 600; int fr_auth_lock = 0; fr_authstat_t fr_authstats; static frauth_t fr_auth[FR_NUMAUTH]; mb_t *fr_authpkts[FR_NUMAUTH]; static int fr_authstart = 0, fr_authend = 0, fr_authnext = 0; static frauthent_t *fae_list = NULL; frentry_t *ipauth = NULL, *fr_authlist = NULL; /* * Check if a packet has authorization. If the packet is found to match an * authorization result and that would result in a feedback loop (i.e. it * will end up returning FR_AUTH) then return FR_BLOCK instead. */ u_32_t fr_checkauth(ip, fin) ip_t *ip; fr_info_t *fin; { u_short id = ip->ip_id; frentry_t *fr; frauth_t *fra; u_32_t pass; int i; if (fr_auth_lock || !fr_authused) return 0; READ_ENTER(&ipf_auth); for (i = fr_authstart; i != fr_authend; ) { /* * index becomes -2 only after an SIOCAUTHW. Check this in * case the same packet gets sent again and it hasn't yet been * auth'd. */ fra = fr_auth + i; if ((fra->fra_index == -2) && (id == fra->fra_info.fin_id) && !bcmp((char *)fin, (char *)&fra->fra_info, FI_CSIZE)) { /* * Avoid feedback loop. 
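The check that immediately follows is the crux of this guard: a matched entry whose saved verdict is empty, or whose verdict would send the packet back through authentication, is downgraded to an outright block. A minimal userland sketch of that rule, using made-up FR_AUTH/FR_BLOCK bit values rather than ipfilter's real definitions:

    #include <stdio.h>

    #define FR_BLOCK 0x1     /* hypothetical bit values, for illustration */
    #define FR_AUTH  0x2

    static unsigned
    resolve_verdict(unsigned saved_pass)
    {
            /*
             * An empty verdict, or one that would re-queue the packet
             * for authentication, is forced to "block".
             */
            if (saved_pass == 0 || (saved_pass & FR_AUTH))
                    return (FR_BLOCK);
            return (saved_pass);
    }

    int
    main(void)
    {
            printf("0x0  -> 0x%x\n", resolve_verdict(0x0));     /* blocked */
            printf("AUTH -> 0x%x\n", resolve_verdict(FR_AUTH)); /* blocked */
            printf("0x10 -> 0x%x\n", resolve_verdict(0x10));    /* kept   */
            return (0);
    }
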
*/ if (!(pass = fra->fra_pass) || (pass & FR_AUTH)) pass = FR_BLOCK; /* * Create a dummy rule for the stateful checking to * use and return. Zero out any values we don't * trust from userland! */ if ((pass & FR_KEEPSTATE) || ((pass & FR_KEEPFRAG) && (fin->fin_fi.fi_fl & FI_FRAG))) { KMALLOC(fr, frentry_t *); if (fr) { bcopy((char *)fra->fra_info.fin_fr, fr, sizeof(*fr)); fr->fr_grp = NULL; fr->fr_ifa = fin->fin_ifp; fr->fr_func = NULL; fr->fr_ref = 1; fr->fr_flags = pass; #if BSD >= 199306 fr->fr_oifa = NULL; #endif } } else fr = fra->fra_info.fin_fr; fin->fin_fr = fr; RWLOCK_EXIT(&ipf_auth); WRITE_ENTER(&ipf_auth); if (fr && fr != fra->fra_info.fin_fr) { fr->fr_next = fr_authlist; fr_authlist = fr; } fr_authstats.fas_hits++; fra->fra_index = -1; fr_authused--; if (i == fr_authstart) { while (fra->fra_index == -1) { i++; fra++; if (i == FR_NUMAUTH) { i = 0; fra = fr_auth; } fr_authstart = i; if (i == fr_authend) break; } if (fr_authstart == fr_authend) { fr_authnext = 0; fr_authstart = fr_authend = 0; } } RWLOCK_EXIT(&ipf_auth); return pass; } i++; if (i == FR_NUMAUTH) i = 0; } fr_authstats.fas_miss++; RWLOCK_EXIT(&ipf_auth); return 0; } /* * Check if we have room in the auth array to hold details for another packet. * If we do, store it and wake up any user programs which are waiting to * hear about these events. */ int fr_newauth(m, fin, ip) mb_t *m; fr_info_t *fin; ip_t *ip; { #if defined(_KERNEL) && SOLARIS qif_t *qif = fin->fin_qif; #endif frauth_t *fra; int i; if (fr_auth_lock) return 0; WRITE_ENTER(&ipf_auth); if (fr_authstart > fr_authend) { fr_authstats.fas_nospace++; RWLOCK_EXIT(&ipf_auth); return 0; } else { if (fr_authused == FR_NUMAUTH) { fr_authstats.fas_nospace++; RWLOCK_EXIT(&ipf_auth); return 0; } } fr_authstats.fas_added++; fr_authused++; i = fr_authend++; if (fr_authend == FR_NUMAUTH) fr_authend = 0; RWLOCK_EXIT(&ipf_auth); fra = fr_auth + i; fra->fra_index = i; fra->fra_pass = 0; fra->fra_age = fr_defaultauthage; bcopy((char *)fin, (char *)&fra->fra_info, sizeof(*fin)); #if SOLARIS && defined(_KERNEL) # if !defined(sparc) /* * No need to copyback here as we want to undo the changes, not keep * them. 
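fr_newauth() above reserves a slot in a fixed circular array, bumping fr_authend with wraparound and failing with fas_nospace when the ring is full. A compilable miniature of just that reservation step; NUMAUTH and the ring variables are stand-ins, and the real code additionally takes the ipf_auth lock and rejects when the start index has passed the end:

    #include <stdio.h>

    #define NUMAUTH 8        /* stand-in for FR_NUMAUTH */

    static int q_end, q_used;        /* the real ring also keeps a start */

    /* Return a reserved slot index, or -1 when the ring is full. */
    static int
    ring_reserve(void)
    {
            int i;

            if (q_used == NUMAUTH)
                    return (-1);     /* the "fas_nospace" case */
            q_used++;
            i = q_end++;
            if (q_end == NUMAUTH)
                    q_end = 0;       /* wrap around */
            return (i);
    }

    int
    main(void)
    {
            int n;

            for (n = 0; n < NUMAUTH + 1; n++)
                    printf("slot %d\n", ring_reserve());  /* last is -1 */
            return (0);
    }
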
*/ if ((ip == (ip_t *)m->b_rptr) && (ip->ip_v == 4)) { register u_short bo; bo = ip->ip_len; ip->ip_len = htons(bo); # if !SOLARIS && !defined(__NetBSD__) && !defined(__FreeBSD__) /* 4.4BSD converts this ip_input.c, but I don't in solaris.c */ bo = ip->ip_id; ip->ip_id = htons(bo); # endif bo = ip->ip_off; ip->ip_off = htons(bo); } # endif m->b_rptr -= qif->qf_off; fr_authpkts[i] = *(mblk_t **)fin->fin_mp; fra->fra_q = qif->qf_q; cv_signal(&ipfauthwait); #else # if defined(BSD) && !defined(sparc) && (BSD >= 199306) if (!fin->fin_out) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); } # endif fr_authpkts[i] = m; WAKEUP(&fr_authnext); #endif return 1; } int fr_auth_ioctl(data, mode, cmd, fr, frptr) caddr_t data; int mode; #if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) u_long cmd; #else int cmd; #endif frentry_t *fr, **frptr; { mb_t *m; #if defined(_KERNEL) && !SOLARIS int s; #endif frauth_t auth, *au = &auth, *fra; frauthent_t *fae, **faep; int i, error = 0; switch (cmd) { case SIOCSTLCK : error = fr_lock(data, &fr_auth_lock); break; case SIOCINIFR : case SIOCRMIFR : case SIOCADIFR : error = EINVAL; break; case SIOCINAFR : error = EINVAL; break; case SIOCRMAFR : case SIOCADAFR : for (faep = &fae_list; (fae = *faep); ) if (&fae->fae_fr == fr) break; else faep = &fae->fae_next; if (cmd == SIOCRMAFR) { if (!fr || !frptr) error = EINVAL; else if (!fae) error = ESRCH; else { WRITE_ENTER(&ipf_auth); SPL_NET(s); *faep = fae->fae_next; *frptr = fr->fr_next; SPL_X(s); RWLOCK_EXIT(&ipf_auth); KFREE(fae); } } else if (fr && frptr) { KMALLOC(fae, frauthent_t *); if (fae != NULL) { bcopy((char *)fr, (char *)&fae->fae_fr, sizeof(*fr)); WRITE_ENTER(&ipf_auth); SPL_NET(s); fae->fae_age = fr_defaultauthage; fae->fae_fr.fr_hits = 0; fae->fae_fr.fr_next = *frptr; *frptr = &fae->fae_fr; fae->fae_next = *faep; *faep = fae; ipauth = &fae_list->fae_fr; SPL_X(s); RWLOCK_EXIT(&ipf_auth); } else error = ENOMEM; } else error = EINVAL; break; case SIOCATHST: fr_authstats.fas_faelist = fae_list; error = IWCOPYPTR((char *)&fr_authstats, data, sizeof(fr_authstats)); break; case SIOCAUTHW: if (!(mode & FWRITE)) { error = EPERM; break; } fr_authioctlloop: READ_ENTER(&ipf_auth); if ((fr_authnext != fr_authend) && fr_authpkts[fr_authnext]) { error = IWCOPYPTR((char *)&fr_auth[fr_authnext], data, sizeof(frauth_t)); RWLOCK_EXIT(&ipf_auth); if (error) break; WRITE_ENTER(&ipf_auth); SPL_NET(s); fr_authnext++; if (fr_authnext == FR_NUMAUTH) fr_authnext = 0; SPL_X(s); RWLOCK_EXIT(&ipf_auth); return 0; } RWLOCK_EXIT(&ipf_auth); #ifdef _KERNEL # if SOLARIS mutex_enter(&ipf_authmx); if (!cv_wait_sig(&ipfauthwait, &ipf_authmx)) { mutex_exit(&ipf_authmx); return EINTR; } mutex_exit(&ipf_authmx); # else error = SLEEP(&fr_authnext, "fr_authnext"); # endif #endif if (!error) goto fr_authioctlloop; break; case SIOCAUTHR: if (!(mode & FWRITE)) { error = EPERM; break; } error = IRCOPYPTR(data, (caddr_t)&auth, sizeof(auth)); if (error) return error; WRITE_ENTER(&ipf_auth); SPL_NET(s); i = au->fra_index; fra = fr_auth + i; if ((i < 0) || (i > FR_NUMAUTH) || (fra->fra_info.fin_id != au->fra_info.fin_id)) { SPL_X(s); RWLOCK_EXIT(&ipf_auth); return EINVAL; } m = fr_authpkts[i]; fra->fra_index = -2; fra->fra_pass = au->fra_pass; fr_authpkts[i] = NULL; RWLOCK_EXIT(&ipf_auth); #ifdef _KERNEL if (m && au->fra_info.fin_out) { # if SOLARIS error = (fr_qout(fra->fra_q, m) == 0) ? 
EINVAL : 0; # else /* SOLARIS */ struct route ro; bzero((char *)&ro, sizeof(ro)); # if ((_BSDI_VERSION >= 199802) && (_BSDI_VERSION < 200005)) || \ - defined(__OpenBSD__) || (defined(IRIX) && (IRIX >= 605)) + defined(__OpenBSD__) || (defined(IRIX) && (IRIX >= 605)) || \ + (__FreeBSD_version >= 500042) error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); # else error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL); # endif if (ro.ro_rt) { RTFREE(ro.ro_rt); } # endif /* SOLARIS */ if (error) fr_authstats.fas_sendfail++; else fr_authstats.fas_sendok++; } else if (m) { # if SOLARIS error = (fr_qin(fra->fra_q, m) == 0) ? EINVAL : 0; # else /* SOLARIS */ if (! IF_HANDOFF(&ipintrq, m, NULL)) error = ENOBUFS; else schednetisr(NETISR_IP); # endif /* SOLARIS */ if (error) fr_authstats.fas_quefail++; else fr_authstats.fas_queok++; } else error = EINVAL; # if SOLARIS if (error) error = EINVAL; # else /* * If we experience an error which will result in the packet * not being processed, make sure we advance to the next one. */ if (error == ENOBUFS) { fr_authused--; fra->fra_index = -1; fra->fra_pass = 0; if (i == fr_authstart) { while (fra->fra_index == -1) { i++; if (i == FR_NUMAUTH) i = 0; fr_authstart = i; if (i == fr_authend) break; } if (fr_authstart == fr_authend) { fr_authnext = 0; fr_authstart = fr_authend = 0; } } } # endif #endif /* _KERNEL */ SPL_X(s); break; default : error = EINVAL; break; } return error; } /* * Free all network buffer memory used to keep saved packets. */ void fr_authunload() { register int i; register frauthent_t *fae, **faep; frentry_t *fr, **frp; mb_t *m; WRITE_ENTER(&ipf_auth); for (i = 0; i < FR_NUMAUTH; i++) { if ((m = fr_authpkts[i])) { FREE_MB_T(m); fr_authpkts[i] = NULL; fr_auth[i].fra_index = -1; } } for (faep = &fae_list; (fae = *faep); ) { *faep = fae->fae_next; KFREE(fae); } ipauth = NULL; RWLOCK_EXIT(&ipf_auth); if (fr_authlist) { /* * We *MuST* reget ipf_auth because otherwise we won't get the * locks in the right order and risk deadlock. * We need ipf_mutex here to prevent a rule from using it * inside fr_check(). */ WRITE_ENTER(&ipf_mutex); WRITE_ENTER(&ipf_auth); for (frp = &fr_authlist; (fr = *frp); ) { if (fr->fr_ref == 1) { *frp = fr->fr_next; KFREE(fr); } else frp = &fr->fr_next; } RWLOCK_EXIT(&ipf_auth); RWLOCK_EXIT(&ipf_mutex); } } /* * Slowly expire held auth records. Timeouts are set * in expectation of this being called twice per second. 
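The routine below decrements each held entry's fra_age on every pass; at the stated two calls per second, the default age of 600 holds an unanswered packet for roughly 300 seconds. A small sketch of that countdown, with illustrative names:

    #include <stdio.h>

    #define DEFAULT_AGE 600          /* stand-in for fr_defaultauthage */

    struct held {
            int age;                 /* stand-in for fra_age */
            int live;
    };

    int
    main(void)
    {
            struct held h = { DEFAULT_AGE, 1 };
            int passes = 0;

            while (h.live) {
                    /* One expiry pass: decrement, free when it hits 0. */
                    if (--h.age == 0)
                            h.live = 0;      /* FREE_MB_T() would go here */
                    passes++;
            }
            /* Two passes per second: 600 passes is about 300 seconds. */
            printf("expired after %d passes (~%d s)\n", passes, passes / 2);
            return (0);
    }
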
*/ void fr_authexpire() { register int i; register frauth_t *fra; register frauthent_t *fae, **faep; register frentry_t *fr, **frp; mb_t *m; #if !SOLARIS && defined(_KERNEL) int s; #endif if (fr_auth_lock) return; SPL_NET(s); WRITE_ENTER(&ipf_auth); for (i = 0, fra = fr_auth; i < FR_NUMAUTH; i++, fra++) { if ((!--fra->fra_age) && (m = fr_authpkts[i])) { FREE_MB_T(m); fr_authpkts[i] = NULL; fr_auth[i].fra_index = -1; fr_authstats.fas_expire++; fr_authused--; } } for (faep = &fae_list; (fae = *faep); ) { if (!--fae->fae_age) { *faep = fae->fae_next; KFREE(fae); fr_authstats.fas_expire++; } else faep = &fae->fae_next; } if (fae_list != NULL) ipauth = &fae_list->fae_fr; else ipauth = NULL; for (frp = &fr_authlist; (fr = *frp); ) { if (fr->fr_ref == 1) { *frp = fr->fr_next; KFREE(fr); } else frp = &fr->fr_next; } RWLOCK_EXIT(&ipf_auth); SPL_X(s); } Index: head/sys/dev/hfa/fore_receive.c =================================================================== --- head/sys/dev/hfa/fore_receive.c (revision 105193) +++ head/sys/dev/hfa/fore_receive.c (revision 105194) @@ -1,596 +1,596 @@ /* * * =================================== * HARP | Host ATM Research Platform * =================================== * * * This Host ATM Research Platform ("HARP") file (the "Software") is * made available by Network Computing Services, Inc. ("NetworkCS") * "AS IS". NetworkCS does not provide maintenance, improvements or * support of any kind. * * NETWORKCS MAKES NO WARRANTIES OR REPRESENTATIONS, EXPRESS OR IMPLIED, * INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE, AS TO ANY ELEMENT OF THE * SOFTWARE OR ANY SUPPORT PROVIDED IN CONNECTION WITH THIS SOFTWARE. * In no event shall NetworkCS be responsible for any damages, including * but not limited to consequential damages, arising from or relating to * any use of the Software or related support. * * Copyright 1994-1998 Network Computing Services, Inc. * * Copies of this Software may be made, however, the above copyright * notice must be reproduced on all copies. 
* * @(#) $FreeBSD$ * */ /* * FORE Systems 200-Series Adapter Support * --------------------------------------- * * Receive queue management * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef lint __RCSID("@(#) $FreeBSD$"); #endif /* * Local functions */ static void fore_recv_stack(void *, KBuffer *); /* * Allocate Receive Queue Data Structures * * Arguments: * fup pointer to device unit structure * * Returns: * 0 allocations successful * else allocation failed */ int fore_recv_allocate(fup) Fore_unit *fup; { caddr_t memp; /* * Allocate non-cacheable memory for receive status words */ memp = atm_dev_alloc(sizeof(Q_status) * RECV_QUELEN, QSTAT_ALIGN, ATM_DEV_NONCACHE); if (memp == NULL) { return (1); } fup->fu_recv_stat = (Q_status *) memp; memp = (caddr_t)vtophys(fup->fu_recv_stat); if (memp == NULL) { return (1); } fup->fu_recv_statd = (Q_status *) memp; /* * Allocate memory for receive descriptors */ memp = atm_dev_alloc(sizeof(Recv_descr) * RECV_QUELEN, RECV_DESCR_ALIGN, 0); if (memp == NULL) { return (1); } fup->fu_recv_desc = (Recv_descr *) memp; memp = (caddr_t)vtophys(fup->fu_recv_desc); if (memp == NULL) { return (1); } fup->fu_recv_descd = (Recv_descr *) memp; return (0); } /* * Receive Queue Initialization * * Allocate and initialize the host-resident receive queue structures * and then initialize the CP-resident queue structures. * * Called at interrupt level. * * Arguments: * fup pointer to device unit structure * * Returns: * none */ void fore_recv_initialize(fup) Fore_unit *fup; { Aali *aap = fup->fu_aali; Recv_queue *cqp; H_recv_queue *hrp; Recv_descr *rdp; Recv_descr *rdp_dma; Q_status *qsp; Q_status *qsp_dma; int i; /* * Point to CP-resident receive queue */ cqp = (Recv_queue *)(fup->fu_ram + CP_READ(aap->aali_recv_q)); /* * Point to host-resident receive queue structures */ hrp = fup->fu_recv_q; qsp = fup->fu_recv_stat; qsp_dma = fup->fu_recv_statd; rdp = fup->fu_recv_desc; rdp_dma = fup->fu_recv_descd; /* * Loop thru all queue entries and do whatever needs doing */ for (i = 0; i < RECV_QUELEN; i++) { /* * Set queue status word to free */ *qsp = QSTAT_FREE; /* * Set up host queue entry and link into ring */ hrp->hrq_cpelem = cqp; hrp->hrq_status = qsp; hrp->hrq_descr = rdp; hrp->hrq_descr_dma = rdp_dma; if (i == (RECV_QUELEN - 1)) hrp->hrq_next = fup->fu_recv_q; else hrp->hrq_next = hrp + 1; /* * Now let the CP into the game */ cqp->cq_descr = (CP_dma) CP_WRITE(rdp_dma); cqp->cq_status = (CP_dma) CP_WRITE(qsp_dma); /* * Bump all queue pointers */ hrp++; qsp++; qsp_dma++; rdp++; rdp_dma++; cqp++; } /* * Initialize queue pointers */ fup->fu_recv_head = fup->fu_recv_q; return; } /* * Drain Receive Queue * * This function will process all completed entries at the head of the * receive queue. The received segments will be linked into a received * PDU buffer chain and it will then be passed up the PDU's VCC stack for * processing by the next higher protocol layer. * * May be called in interrupt state. * Must be called with interrupts locked out. 
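fore_recv_initialize() above threads the RECV_QUELEN host entries into a ring by pointing the last entry's hrq_next back at the first. The same pattern in a self-contained userland form; QUELEN and struct entry are stand-ins for the driver's types:

    #include <stdio.h>

    #define QUELEN 4                 /* stand-in for RECV_QUELEN */

    struct entry {
            struct entry *next;
            int id;
    };

    int
    main(void)
    {
            struct entry q[QUELEN], *e;
            int i;

            for (i = 0; i < QUELEN; i++) {
                    q[i].id = i;
                    /* Last entry closes the ring, as hrq_next does. */
                    q[i].next = (i == QUELEN - 1) ? &q[0] : &q[i + 1];
            }

            /* Walk the ring once past the wrap point: 0 1 2 3 0 1 */
            for (i = 0, e = &q[0]; i < QUELEN + 2; i++, e = e->next)
                    printf("%d ", e->id);
            printf("\n");
            return (0);
    }
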
* * Arguments: * fup pointer to device unit structure * * Returns: * none */ void fore_recv_drain(fup) Fore_unit *fup; { H_recv_queue *hrp = NULL; Recv_descr *rdp; Recv_seg_descr *rsp; Buf_handle *bhp; Fore_vcc *fvp; struct vccb *vcp; KBuffer *m, *mhead, *mtail; caddr_t cp; u_long hdr, nsegs; u_int seglen, type0; int i, pdulen, retries = 0, error; /* Silence the compiler */ mtail = NULL; type0 = 0; /* * Process each completed entry */ retry: while (*fup->fu_recv_head->hrq_status & QSTAT_COMPLETED) { /* * Get completed entry's receive descriptor */ hrp = fup->fu_recv_head; rdp = hrp->hrq_descr; #ifdef VAC /* * Cache flush receive descriptor */ if (vac) { vac_flush((addr_t)rdp, sizeof(Recv_descr)); } #endif hdr = rdp->rd_cell_hdr; nsegs = rdp->rd_nsegs; pdulen = 0; error = 0; mhead = NULL; /* * Locate incoming VCC for this PDU */ fvp = (Fore_vcc *) atm_dev_vcc_find((Cmn_unit *)fup, ATM_HDR_GET_VPI(hdr), ATM_HDR_GET_VCI(hdr), VCC_IN); /* * Check for a receive error * * Apparently the receive descriptor itself contains valid * information, but the received pdu data is probably bogus. * We'll arrange for the receive buffer segments to be tossed. */ if (*hrp->hrq_status & QSTAT_ERROR) { fup->fu_pif.pif_ierrors++; if (fvp) { vcp = fvp->fv_connvc->cvc_vcc; vcp->vc_ierrors++; if (vcp->vc_nif) vcp->vc_nif->nif_if.if_ierrors++; } ATM_DEBUG1("fore receive error: hdr=0x%lx\n", hdr); error = 1; } /* * Build PDU buffer chain from receive segments */ for (i = 0, rsp = rdp->rd_seg; i < nsegs; i++, rsp++) { bhp = rsp->rsd_handle; seglen = rsp->rsd_len; /* * Remove buffer from our supplied queue and get * to the underlying buffer */ switch (bhp->bh_type) { case BHT_S1_SMALL: DEQUEUE(bhp, Buf_handle, bh_qelem, fup->fu_buf1s_bq); fup->fu_buf1s_cnt--; m = (KBuffer *) ((caddr_t)bhp - BUF1_SM_HOFF); KB_DATASTART(m, cp, caddr_t); break; case BHT_S1_LARGE: DEQUEUE(bhp, Buf_handle, bh_qelem, fup->fu_buf1l_bq); fup->fu_buf1l_cnt--; m = (KBuffer *) ((caddr_t)bhp - BUF1_LG_HOFF); KB_DATASTART(m, cp, caddr_t); break; default: log(LOG_ERR, "fore_recv_drain: bhp=%p type=0x%x\n", bhp, bhp->bh_type); panic("fore_recv_drain: bad buffer type"); } /* * Toss any zero-length or receive error buffers */ if ((seglen == 0) || error) { KB_FREEALL(m); continue; } /* * Link buffer into chain */ if (mhead == NULL) { type0 = bhp->bh_type; KB_LINKHEAD(m, mhead); mhead = m; } else { KB_LINK(m, mtail); } KB_LEN(m) = seglen; pdulen += seglen; mtail = m; /* * Flush received buffer data */ #ifdef VAC if (vac) { addr_t dp; KB_DATASTART(m, dp, addr_t); vac_pageflush(dp); } #endif } /* * Make sure we've got a non-null PDU */ if (mhead == NULL) { goto free_ent; } /* * We only support user data PDUs (for now) */ if (hdr & ATM_HDR_SET_PT(ATM_PT_NONUSER)) { KB_FREEALL(mhead); goto free_ent; } /* * Toss the data if there's no VCC */ if (fvp == NULL) { fup->fu_stats->st_drv.drv_rv_novcc++; KB_FREEALL(mhead); goto free_ent; } #ifdef DIAGNOSTIC if (atm_dev_print) atm_dev_pdu_print((Cmn_unit *)fup, (Cmn_vcc *)fvp, mhead, "fore_recv"); #endif /* * Make sure we have our queueing headroom at the front * of the buffer chain */ if (type0 != BHT_S1_SMALL) { /* * Small buffers already have headroom built-in, but * if CP had to use a large buffer for the first * buffer, then we have to allocate a buffer here to * contain the headroom. 
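fore_recv_drain() assembles the PDU by appending each segment at the chain's tail and, when the first buffer lacks headroom, prepending a fresh empty buffer at the head. A plain-C model of that append/prepend bookkeeping, with ordinary pointers standing in for the KB_* macros:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf {
            struct buf *next;
            int len;
    };

    static struct buf *
    mkbuf(int len)
    {
            struct buf *b = malloc(sizeof(*b));

            if (b == NULL)
                    abort();
            b->next = NULL;
            b->len = len;
            return (b);
    }

    int
    main(void)
    {
            struct buf *head = NULL, *tail = NULL, *b;
            int seglens[3] = { 512, 512, 88 }, pdulen = 0, i;

            for (i = 0; i < 3; i++) {        /* KB_LINK-style tail append */
                    b = mkbuf(seglens[i]);
                    if (head == NULL)
                            head = b;
                    else
                            tail->next = b;
                    tail = b;
                    pdulen += b->len;
            }

            b = mkbuf(0);                    /* KB_LINKHEAD-style prepend   */
            b->next = head;                  /* empty buffer gives headroom */
            head = b;

            printf("pdulen = %d\n", pdulen); /* 1112; header adds no length */

            while (head != NULL) {           /* KB_FREEALL analogue */
                    b = head->next;
                    free(head);
                    head = b;
            }
            return (0);
    }
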
*/ fup->fu_stats->st_drv.drv_rv_nosbf++; KB_ALLOCPKT(m, BUF1_SM_SIZE, KB_F_NOWAIT, KB_T_DATA); if (m == NULL) { fup->fu_stats->st_drv.drv_rv_nomb++; KB_FREEALL(mhead); goto free_ent; } /* * Put new buffer at head of PDU chain */ KB_LINKHEAD(m, mhead); KB_LEN(m) = 0; KB_HEADSET(m, BUF1_SM_DOFF); mhead = m; } /* * It looks like we've got a valid PDU - count it quick!! */ mhead->m_pkthdr.rcvif = NULL; mhead->m_pkthdr.csum_flags = 0; - mhead->m_pkthdr.aux = NULL; + SLIST_INIT(&mhead->m_pkthdr.tags); KB_PLENSET(mhead, pdulen); fup->fu_pif.pif_ipdus++; fup->fu_pif.pif_ibytes += pdulen; vcp = fvp->fv_connvc->cvc_vcc; vcp->vc_ipdus++; vcp->vc_ibytes += pdulen; if (vcp->vc_nif) { vcp->vc_nif->nif_ibytes += pdulen; vcp->vc_nif->nif_if.if_ipackets++; #if (defined(BSD) && (BSD >= 199103)) vcp->vc_nif->nif_if.if_ibytes += pdulen; #endif } /* * The STACK_CALL needs to happen at splnet() in order * for the stack sequence processing to work. Schedule an * interrupt queue callback at splnet() since we are * currently at device level. */ /* * Prepend callback function pointer and token value to buffer. * We have already guaranteed that the space is available * in the first buffer. */ KB_HEADADJ(mhead, sizeof(atm_intr_func_t) + sizeof(int)); KB_DATASTART(mhead, cp, caddr_t); *((atm_intr_func_t *)cp) = fore_recv_stack; cp += sizeof(atm_intr_func_t); *((void **)cp) = (void *)fvp; /* * Schedule callback */ if (IF_HANDOFF(&atm_intrq, mhead, NULL)) { schednetisr(NETISR_ATM); } else { fup->fu_stats->st_drv.drv_rv_ifull++; goto free_ent; } free_ent: /* * Mark this entry free for use and bump head pointer * to the next entry in the queue */ *hrp->hrq_status = QSTAT_FREE; hrp->hrq_cpelem->cq_descr = (CP_dma) CP_WRITE((u_long)hrp->hrq_descr_dma); fup->fu_recv_head = hrp->hrq_next; } /* * Nearly all of the interrupts generated by the CP will be due * to PDU reception. However, we may receive an interrupt before * the CP has completed the status word DMA to host memory. Thus, * if we haven't processed any PDUs during this interrupt, we will * wait a bit for completed work on the receive queue, rather than * having to field an extra interrupt very soon. */ if (hrp == NULL) { if (++retries <= FORE_RECV_RETRY) { DELAY(FORE_RECV_DELAY); goto retry; } } return; } /* * Pass Incoming PDU up Stack * * This function is called via the core ATM interrupt queue callback * set in fore_recv_drain(). It will pass the supplied incoming * PDU up the incoming VCC's stack. * * Called at splnet. * * Arguments: * tok token to identify stack instantiation * m pointer to incoming PDU buffer chain * * Returns: * none */ static void fore_recv_stack(tok, m) void *tok; KBuffer *m; { Fore_vcc *fvp = (Fore_vcc *)tok; int err; /* * Send the data up the stack */ STACK_CALL(CPCS_UNITDATA_SIG, fvp->fv_upper, fvp->fv_toku, fvp->fv_connvc, (int)m, 0, err); if (err) KB_FREEALL(m); return; } /* * Free Receive Queue Data Structures * * Arguments: * fup pointer to device unit structure * * Returns: * none */ void fore_recv_free(fup) Fore_unit *fup; { /* * We'll just let fore_buf_free() take care of freeing any * buffers sitting on the receive queue (which are also still * on the fu_*_bq queue). 
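The one-line change above, replacing the m_pkthdr.aux assignment with SLIST_INIT(&mhead->m_pkthdr.tags), is the visible edge of this commit's move to packet tags. A minimal userland demonstration of the same sys/queue.h pattern; the struct names are invented, not the real mbuf definitions:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct tag {                     /* invented, not the kernel m_tag */
            SLIST_ENTRY(tag) link;
            int type;
    };

    struct pkthdr {                  /* invented, not the real pkthdr */
            SLIST_HEAD(, tag) tags;
    };

    int
    main(void)
    {
            struct pkthdr h;
            struct tag *t;

            SLIST_INIT(&h.tags);             /* what the new code does */

            t = malloc(sizeof(*t));
            if (t == NULL)
                    abort();
            t->type = 42;
            SLIST_INSERT_HEAD(&h.tags, t, link);

            SLIST_FOREACH(t, &h.tags, link)  /* consumers walk the list */
                    printf("tag type %d\n", t->type);

            while (!SLIST_EMPTY(&h.tags)) {  /* delete-chain analogue */
                    t = SLIST_FIRST(&h.tags);
                    SLIST_REMOVE_HEAD(&h.tags, link);
                    free(t);
            }
            return (0);
    }

An empty list costs only the initialized head pointer per packet, which is why the driver can unconditionally initialize it here.
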
*/ if (fup->fu_flags & CUF_INITED) { } /* * Free the status words */ if (fup->fu_recv_stat) { atm_dev_free((volatile void *)fup->fu_recv_stat); fup->fu_recv_stat = NULL; fup->fu_recv_statd = NULL; } /* * Free the receive descriptors */ if (fup->fu_recv_desc) { atm_dev_free(fup->fu_recv_desc); fup->fu_recv_desc = NULL; fup->fu_recv_descd = NULL; } return; } Index: head/sys/kern/subr_mbuf.c =================================================================== --- head/sys/kern/subr_mbuf.c (revision 105193) +++ head/sys/kern/subr_mbuf.c (revision 105194) @@ -1,1550 +1,1543 @@ /*- * Copyright (c) 2001, 2002 * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /****************************************************************************** * mb_alloc mbuf and cluster allocator. * * Maximum number of PCPU containers. If you know what you're doing you could * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your * system during compilation, and thus prevent kernel structure bloat. * * SMP and non-SMP kernels clearly have a different number of possible CPUs, * but because we cannot assume a dense array of CPUs, we always allocate * and traverse PCPU containers up to NCPU amount and merely check for * CPU availability. */ #ifdef MBALLOC_NCPU #define NCPU MBALLOC_NCPU #else #define NCPU MAXCPU #endif /*- * The mbuf allocator is heavily based on Alfred Perlstein's * (alfred@FreeBSD.org) "memcache" allocator which is itself based * on concepts from several per-CPU memory allocators. The difference * between this allocator and memcache is that, among other things: * * (i) We don't free back to the map from the free() routine - we leave the * option of implementing lazy freeing (from a kproc) in the future. * * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the * maximum number of allocatable objects of a given type. 
Further, * we handle blocking on a cv in the case that the map is starved and * we have to rely solely on cached (circulating) objects. * * The mbuf allocator keeps all objects that it allocates in mb_buckets. * The buckets keep a page worth of objects (an object can be an mbuf or an * mbuf cluster) and facilitate moving larger sets of contiguous objects * from the per-CPU lists to the main list for the given object. The buckets * also have an added advantage in that after several moves from a per-CPU * list to the main list and back to the per-CPU list, contiguous objects * are kept together, thus trying to put the TLB cache to good use. * * The buckets are kept on singly-linked lists called "containers." A container * is protected by a mutex lock in order to ensure consistency. The mutex lock * itself is allocated separately and attached to the container at boot time, * thus allowing for certain containers to share the same mutex lock. Per-CPU * containers for mbufs and mbuf clusters all share the same per-CPU * lock whereas the "general system" containers (i.e., the "main lists") for * these objects share one global lock. */ struct mb_bucket { SLIST_ENTRY(mb_bucket) mb_blist; int mb_owner; int mb_numfree; void *mb_free[0]; }; struct mb_container { SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; struct mtx *mc_lock; int mc_numowner; u_int mc_starved; long *mc_types; u_long *mc_objcount; u_long *mc_numpgs; }; struct mb_gen_list { struct mb_container mb_cont; struct cv mgl_mstarved; }; struct mb_pcpu_list { struct mb_container mb_cont; }; /* * Boot-time configurable object counts that will determine the maximum * number of permitted objects in the mbuf and mcluster cases. In the * ext counter (nmbcnt) case, it's just an indicator serving to scale * kmem_map size properly - in other words, we may be allowed to allocate * more than nmbcnt counters, whereas we will never be allowed to allocate * more than nmbufs mbufs or nmbclusters mclusters. * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be * allocatable by the sfbuf allocator (found in uipc_syscalls.c) */ #ifndef NMBCLUSTERS #define NMBCLUSTERS (1024 + maxusers * 64) #endif #ifndef NMBUFS #define NMBUFS (nmbclusters * 2) #endif #ifndef NSFBUFS #define NSFBUFS (512 + maxusers * 16) #endif #ifndef NMBCNTS #define NMBCNTS (nmbclusters + nsfbufs) #endif int nmbufs; int nmbclusters; int nmbcnt; int nsfbufs; /* * Perform sanity checks of tunables declared above. */ static void tunable_mbinit(void *dummy) { /* * This has to be done before VM init. */ nmbclusters = NMBCLUSTERS; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); nmbufs = NMBUFS; TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); nsfbufs = NSFBUFS; TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); nmbcnt = NMBCNTS; TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); /* Sanity checks */ if (nmbufs < nmbclusters * 2) nmbufs = nmbclusters * 2; if (nmbcnt < nmbclusters + nsfbufs) nmbcnt = nmbclusters + nsfbufs; } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); /* * The freelist structures and mutex locks. The number statically declared * here depends on the number of CPUs. * * We set up in such a way that all the objects (mbufs, clusters) * share the same mutex lock. It has been established that we do not benefit * from different locks for different objects, so we use the same lock, * regardless of object type. This also allows us to do optimised * multi-object allocations without dropping the lock in between. 
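As described above, a bucket is a page's worth of object pointers plus a free count, so the common allocation is an array index and a decrement. A condensed, runnable model; locking and the container SLIST are omitted, and OBJS_PER_BUCKET stands in for PAGE_SIZE / objsize:

    #include <stdio.h>

    #define OBJS_PER_BUCKET 16       /* PAGE_SIZE / objsize in the real code */

    struct bucket {
            int numfree;
            void *obj[OBJS_PER_BUCKET];      /* models mb_free[] */
    };

    /* Pop one object: an index and a decrement, as in MB_GET_OBJECT. */
    static void *
    bucket_get(struct bucket *b)
    {
            if (b->numfree == 0)
                    return (NULL);   /* empty: real code unlinks the bucket */
            return (b->obj[--b->numfree]);
    }

    int
    main(void)
    {
            static char page[OBJS_PER_BUCKET][64];   /* a fake "page" */
            struct bucket b;
            int i, n = 0;

            b.numfree = OBJS_PER_BUCKET;
            for (i = 0; i < OBJS_PER_BUCKET; i++)
                    b.obj[i] = page[i];

            while (bucket_get(&b) != NULL)
                    n++;
            printf("popped %d objects\n", n);        /* 16 */
            return (0);
    }
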
*/ struct mb_lstmngr { struct mb_gen_list *ml_genlist; struct mb_pcpu_list *ml_cntlst[NCPU]; struct mb_bucket **ml_btable; vm_map_t ml_map; vm_offset_t ml_mapbase; vm_offset_t ml_maptop; int ml_mapfull; u_int ml_objsize; u_int *ml_wmhigh; }; static struct mb_lstmngr mb_list_mbuf, mb_list_clust; static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; u_int *cl_refcntmap; /* * Local macros for internal allocator structure manipulations. */ #ifdef SMP #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] #else #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] #endif #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) #define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ (mb_lst)->ml_cntlst[(num)] #define MB_BUCKET_INDX(mb_obj, mb_lst) \ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE) #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ { \ struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ \ (mb_bckt)->mb_numfree--; \ (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ (*((mb_lst)->mb_cont.mc_objcount))--; \ if ((mb_bckt)->mb_numfree == 0) { \ SLIST_REMOVE_HEAD(_mchd, mb_blist); \ SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ } \ } #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ (mb_bckt)->mb_numfree++; \ (*((mb_lst)->mb_cont.mc_objcount))++; #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ if ((mb_type) != MT_NOTMBUF) \ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ if ((mb_type) != MT_NOTMBUF) \ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) /* * Ownership of buckets/containers is represented by integers. The PCPU * lists range from 0 to NCPU-1. We need a free numerical id for the general * list (we use NCPU). We also need a non-conflicting free bit to indicate * that the bucket is free and removed from a container, while not losing * the bucket's originating container id. We use the highest bit * for the free marker. */ #define MB_GENLIST_OWNER (NCPU) #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) /* Statistics structures for allocator (per-CPU and general). */ static struct mbpstat mb_statpcpu[NCPU + 1]; struct mbstat mbstat; /* Sleep time for wait code (in ticks). */ static int mbuf_wait = 64; static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */ static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */ /* * Objects exported by sysctl(8). 
*/ SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "Maximum number of mbuf clusters available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, "Maximum number of mbufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0, "Number used to scale kmem_map to ensure sufficient space for counters"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, "Sleep time of mbuf subsystem wait allocations during exhaustion"); SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0, "Upper limit of number of mbufs allowed on each PCPU list"); SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0, "Upper limit of number of mbuf clusters allowed on each PCPU list"); SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, "Mbuf general information and statistics"); SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); /* * Prototypes of local allocator routines. */ static void *mb_alloc_wait(struct mb_lstmngr *, short); static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, struct mb_pcpu_list *); static void mb_reclaim(void); static void mbuf_init(void *); /* * Initial allocation numbers. Each parameter represents the number of buckets * of each object that will be placed initially in each PCPU container for * said object. */ #define NMB_MBUF_INIT 4 #define NMB_CLUST_INIT 16 /* * Internal flags that allow for cache locks to remain "persistent" across * allocation and free calls. They may be used in combination. */ #define MBP_PERSIST 0x1 /* Return with lock still held. */ #define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */ /* * Initialize the mbuf subsystem. * * We sub-divide the kmem_map into several submaps; this way, we don't have * to worry about artificially limiting the number of mbuf or mbuf cluster * allocations, due to fear of one type of allocation "stealing" address * space initially reserved for another. * * Set up both the general containers and all the PCPU containers. Populate * the PCPU containers with initial numbers. */ MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) static void mbuf_init(void *dummy) { struct mb_pcpu_list *pcpu_cnt; vm_size_t mb_map_size; int i, j; /* * Set up all the submaps, for each type of object that we deal * with in this allocator. We also allocate space for the cluster * ref. counts in the mbuf map (and not the cluster map) in order to * give clusters a nice contiguous address space without any holes. 
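The submap sizing computed just below adds room for nmbufs mbufs plus one u_int reference count per cluster, then truncates to a page boundary. A worked example with made-up tunable values:

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define MSIZE     256            /* illustrative mbuf size */

    int
    main(void)
    {
            unsigned long nmbufs = 1024, nmbclusters = 512, sz;

            /* nmbufs mbufs plus one u_int refcount per cluster ... */
            sz = nmbufs * MSIZE + nmbclusters * sizeof(unsigned int);
            /* ... truncated to a page boundary, as rounddown() does. */
            sz -= sz % PAGE_SIZE;

            printf("map size: %lu bytes (%lu pages)\n", sz, sz / PAGE_SIZE);
            return (0);
    }
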
*/ mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * sizeof(u_int)); mb_map_size = rounddown(mb_map_size, PAGE_SIZE); mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); if (mb_list_mbuf.ml_btable == NULL) goto bad; mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), &(mb_list_mbuf.ml_maptop), mb_map_size); mb_list_mbuf.ml_map->system_map = 1; mb_list_mbuf.ml_mapfull = 0; mb_list_mbuf.ml_objsize = MSIZE; mb_list_mbuf.ml_wmhigh = &mbuf_limit; mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); mb_map_size = rounddown(mb_map_size, PAGE_SIZE); mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); if (mb_list_clust.ml_btable == NULL) goto bad; mb_list_clust.ml_map = kmem_suballoc(kmem_map, &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), mb_map_size); mb_list_clust.ml_map->system_map = 1; mb_list_clust.ml_mapfull = 0; mb_list_clust.ml_objsize = MCLBYTES; mb_list_clust.ml_wmhigh = &clust_limit; /* * Allocate required general (global) containers for each object type. */ mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, M_NOWAIT); mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, M_NOWAIT); if ((mb_list_mbuf.ml_genlist == NULL) || (mb_list_clust.ml_genlist == NULL)) goto bad; /* * Initialize condition variables and general container mutex locks. */ mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0); cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), "mcluster pool starved"); mb_list_mbuf.ml_genlist->mb_cont.mc_lock = mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; /* * Set up the general containers for each object. */ mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; mb_list_mbuf.ml_genlist->mb_cont.mc_starved = mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); mb_list_clust.ml_genlist->mb_cont.mc_objcount = &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs); mb_list_clust.ml_genlist->mb_cont.mc_numpgs = &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs); mb_list_mbuf.ml_genlist->mb_cont.mc_types = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); /* * Allocate all the required counters for clusters. This makes * cluster allocations/deallocations much faster. */ cl_refcntmap = (u_int *)kmem_malloc(mb_list_clust.ml_map, roundup(nmbclusters * sizeof(u_int), MSIZE), M_NOWAIT); if (cl_refcntmap == NULL) goto bad; /* * Initialize general mbuf statistics. */ mbstat.m_msize = MSIZE; mbstat.m_mclbytes = MCLBYTES; mbstat.m_minclsize = MINCLSIZE; mbstat.m_mlen = MLEN; mbstat.m_mhlen = MHLEN; mbstat.m_numtypes = MT_NTYPES; /* * Allocate and initialize PCPU containers. 
*/ for (i = 0; i < NCPU; i++) { if (CPU_ABSENT(i)) continue; mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), M_MBUF, M_NOWAIT); mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), M_MBUF, M_NOWAIT); if ((mb_list_mbuf.ml_cntlst[i] == NULL) || (mb_list_clust.ml_cntlst[i] == NULL)) goto bad; mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; mb_statpcpu[i].mb_active = 1; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = &(mb_statpcpu[i].mb_mbfree); mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = &(mb_statpcpu[i].mb_clfree); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs = &(mb_statpcpu[i].mb_mbpgs); mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs = &(mb_statpcpu[i].mb_clpgs); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = &(mb_statpcpu[i].mb_mbtypes[0]); mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); /* * Perform initial allocations. */ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); MB_LOCK_CONT(pcpu_cnt); for (j = 0; j < NMB_MBUF_INIT; j++) { if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) == NULL) goto bad; } MB_UNLOCK_CONT(pcpu_cnt); pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); MB_LOCK_CONT(pcpu_cnt); for (j = 0; j < NMB_CLUST_INIT; j++) { if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) == NULL) goto bad; } MB_UNLOCK_CONT(pcpu_cnt); } return; bad: panic("mbuf_init(): failed to initialize mbuf subsystem!"); } /* * Populate a given mbuf PCPU container with a bucket full of fresh new * buffers. Return a pointer to the new bucket (already in the container if * successful), or return NULL on failure. * * LOCKING NOTES: * PCPU container lock must be held when this is called. * The lock is dropped here so that we can cleanly call the underlying VM * code. If we fail, we return with no locks held. If we succeed (i.e., return * non-NULL), we return with the PCPU lock held, ready for allocation from * the returned bucket. */ static struct mb_bucket * mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) { struct mb_bucket *bucket; caddr_t p; int i; MB_UNLOCK_CONT(cnt_lst); /* * If our object's (finite) map is starved now (i.e., no more address * space), bail out now. */ if (mb_list->ml_mapfull) return (NULL); bucket = malloc(sizeof(struct mb_bucket) + PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF, how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); if (bucket == NULL) return (NULL); p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE, how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); if (p == NULL) { free(bucket, M_MBUF); if (how == M_TRYWAIT) mb_list->ml_mapfull = 1; return (NULL); } bucket->mb_numfree = 0; mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) { bucket->mb_free[i] = p; bucket->mb_numfree++; p += mb_list->ml_objsize; } MB_LOCK_CONT(cnt_lst); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); (*(cnt_lst->mb_cont.mc_numpgs))++; *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; return (bucket); } /* * Allocate an mbuf-subsystem type object. 
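mb_pop_cont() above fills a new bucket by carving one freshly mapped page into equal objects. The carving loop in isolation, with illustrative sizes and a static array standing in for kmem_malloc():

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define OBJSIZE   256            /* e.g. MSIZE */

    int
    main(void)
    {
            static char page[PAGE_SIZE];     /* stands in for kmem_malloc() */
            char *freelist[PAGE_SIZE / OBJSIZE], *p = page;
            int i, numfree = 0;

            for (i = 0; i < PAGE_SIZE / OBJSIZE; i++) {
                    freelist[i] = p;         /* bucket->mb_free[i] = p */
                    numfree++;
                    p += OBJSIZE;
            }
            printf("bucket holds %d objects, last at offset %ld\n",
                numfree, (long)(freelist[numfree - 1] - page));
            return (0);
    }
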
* The general case is very easy. Complications only arise if our PCPU * container is empty. Things get worse if the PCPU container is empty, * the general container is empty, and we've run out of address space * in our map; then we try to block if we're willing to (M_TRYWAIT). */ static __inline void * mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist, int *pers_list) { static int last_report; struct mb_pcpu_list *cnt_lst; struct mb_bucket *bucket; void *m; m = NULL; if ((persist & MBP_PERSISTENT) != 0) { /* * If we're a "persistent" call, then the per-CPU #(pers_list) * cache lock is already held, and we just need to refer to * the correct cache descriptor. */ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); } else { cnt_lst = MB_GET_PCPU_LIST(mb_list); MB_LOCK_CONT(cnt_lst); } if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { /* * This is the easy allocation case. We just grab an object * from a bucket in the PCPU container. At worst, we * have just emptied the bucket and so we remove it * from the container. */ MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = cnt_lst->mb_cont.mc_numowner; } else { struct mb_gen_list *gen_list; /* * This is the less-common more difficult case. We must * first verify if the general list has anything for us * and if that also fails, we must allocate a page from * the map and create a new bucket to place in our PCPU * container (already locked). If the map is starved then * we're really in for trouble, as we have to wait on * the general container's condition variable. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { /* * Give ownership of the bucket to our CPU's * container, but only actually put the bucket * in the container if it doesn't become free * upon removing an mbuf from it. */ SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), mb_blist); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; (*(gen_list->mb_cont.mc_numpgs))--; (*(cnt_lst->mb_cont.mc_numpgs))++; *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; bucket->mb_numfree--; m = bucket->mb_free[(bucket->mb_numfree)]; if (bucket->mb_numfree == 0) { SLIST_NEXT(bucket, mb_blist) = NULL; bucket->mb_owner |= MB_BUCKET_FREE; } else { SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; } MB_UNLOCK_CONT(gen_list); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = cnt_lst->mb_cont.mc_numowner; } else { /* * We'll have to allocate a new page. */ MB_UNLOCK_CONT(gen_list); bucket = mb_pop_cont(mb_list, how, cnt_lst); if (bucket != NULL) { MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list=cnt_lst->mb_cont.mc_numowner; } else { if (how == M_TRYWAIT) { /* * Absolute worst-case scenario. * We block if we're willing to, but * only after trying to steal from * other lists. */ m = mb_alloc_wait(mb_list, type); } else { /* XXX: No consistency. 
*/ mbstat.m_drops++; if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf( "All mbufs or mbuf clusters exhausted, please see tuning(7).\n"); } } if (m != NULL && (persist & MBP_PERSIST) != 0) { cnt_lst = MB_GET_PCPU_LIST(mb_list); MB_LOCK_CONT(cnt_lst); *pers_list=cnt_lst->mb_cont.mc_numowner; } } } } return (m); } /* * This is the worst-case scenario called only if we're allocating with * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf * by looking in every PCPU container. If we're still unsuccesful, we * try the general container one last time and possibly block on our * starved cv. */ static void * mb_alloc_wait(struct mb_lstmngr *mb_list, short type) { struct mb_pcpu_list *cnt_lst; struct mb_gen_list *gen_list; struct mb_bucket *bucket; void *m; int i, cv_ret; /* * Try to reclaim mbuf-related objects (mbufs, clusters). */ mb_reclaim(); /* * Cycle all the PCPU containers. Increment starved counts if found * empty. */ for (i = 0; i < NCPU; i++) { if (CPU_ABSENT(i)) continue; cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); MB_LOCK_CONT(cnt_lst); /* * If container is non-empty, get a single object from it. * If empty, increment starved count. */ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); MB_UNLOCK_CONT(cnt_lst); mbstat.m_wait++; /* XXX: No consistency. */ return (m); } else cnt_lst->mb_cont.mc_starved++; MB_UNLOCK_CONT(cnt_lst); } /* * We're still here, so that means it's time to get the general * container lock, check it one more time (now that mb_reclaim() * has been called) and if we still get nothing, block on the cv. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { MB_GET_OBJECT(m, bucket, gen_list); MB_MBTYPES_INC(gen_list, type, 1); MB_UNLOCK_CONT(gen_list); mbstat.m_wait++; /* XXX: No consistency. */ return (m); } gen_list->mb_cont.mc_starved++; cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), gen_list->mb_cont.mc_lock, mbuf_wait); gen_list->mb_cont.mc_starved--; if ((cv_ret == 0) && ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { MB_GET_OBJECT(m, bucket, gen_list); MB_MBTYPES_INC(gen_list, type, 1); mbstat.m_wait++; /* XXX: No consistency. */ } else { mbstat.m_drops++; /* XXX: No consistency. */ m = NULL; } MB_UNLOCK_CONT(gen_list); return (m); } /*- * Free an object to its rightful container. * In the very general case, this operation is really very easy. * Complications arise primarily if: * (a) We've hit the high limit on number of free objects allowed in * our PCPU container. * (b) We're in a critical situation where our container has been * marked 'starved' and we need to issue wakeups on the starved * condition variable. * (c) Minor (odd) cases: our bucket has migrated while we were * waiting for the lock; our bucket is in the general container; * our bucket is empty. */ static __inline void mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist, int *pers_list) { struct mb_pcpu_list *cnt_lst; struct mb_gen_list *gen_list; struct mb_bucket *bucket; u_int owner; bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; /* * Make sure that if after we lock the bucket's present container the * bucket has migrated, that we drop the lock and get the new one. 
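The retry_lock sequence just below must cope with a bucket that migrates between containers while the lock is being acquired: read the owner, lock that owner's list, re-check, and retry on mismatch. The same pattern in miniature, with pthread mutexes standing in for the kernel's mtx locks (build with -lpthread; the demo is single-threaded, so the re-check always succeeds here):

    #include <pthread.h>
    #include <stdio.h>

    struct bucket {
            int owner;               /* index into locks[], may change */
    };

    static pthread_mutex_t locks[2] = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    static pthread_mutex_t *
    lock_bucket(struct bucket *b)
    {
            int owner;

            for (;;) {
                    owner = b->owner;                /* unlocked read */
                    pthread_mutex_lock(&locks[owner]);
                    if (owner == b->owner)           /* still the owner? */
                            return (&locks[owner]);
                    /* Bucket migrated while we waited: retry. */
                    pthread_mutex_unlock(&locks[owner]);
            }
    }

    int
    main(void)
    {
            struct bucket b = { 1 };
            pthread_mutex_t *l = lock_bucket(&b);

            printf("locked list %d\n", b.owner);
            pthread_mutex_unlock(l);
            return (0);
    }
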
*/ retry_lock: owner = bucket->mb_owner & ~MB_BUCKET_FREE; switch (owner) { case MB_GENLIST_OWNER: gen_list = MB_GET_GEN_LIST(mb_list); if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { if (*pers_list != MB_GENLIST_OWNER) { cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); MB_UNLOCK_CONT(cnt_lst); MB_LOCK_CONT(gen_list); } } else { MB_LOCK_CONT(gen_list); } if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { MB_UNLOCK_CONT(gen_list); *pers_list = -1; goto retry_lock; } /* * If we're intended for the general container, this is * real easy: no migrating required. The only `bogon' * is that we're now contending with all the threads * dealing with the general list, but this is expected. */ MB_PUT_OBJECT(m, bucket, gen_list); MB_MBTYPES_DEC(gen_list, type, 1); if (gen_list->mb_cont.mc_starved > 0) cv_signal(&(gen_list->mgl_mstarved)); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(gen_list); else *pers_list = MB_GENLIST_OWNER; break; default: cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { if (*pers_list == MB_GENLIST_OWNER) { gen_list = MB_GET_GEN_LIST(mb_list); MB_UNLOCK_CONT(gen_list); MB_LOCK_CONT(cnt_lst); } else { cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); owner = *pers_list; } } else { MB_LOCK_CONT(cnt_lst); } if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { MB_UNLOCK_CONT(cnt_lst); *pers_list = -1; goto retry_lock; } MB_PUT_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_DEC(cnt_lst, type, 1); if (cnt_lst->mb_cont.mc_starved > 0) { /* * This is a tough case. It means that we've * been flagged at least once to indicate that * we're empty, and that the system is in a critical * situation, so we ought to migrate at least one * bucket over to the general container. * There may or may not be a thread blocking on * the starved condition variable, but chances * are that one will eventually come up soon so * it's better to migrate now than never. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0, ("mb_free: corrupt bucket %p\n", bucket)); SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = MB_GENLIST_OWNER; (*(cnt_lst->mb_cont.mc_objcount))--; (*(gen_list->mb_cont.mc_objcount))++; (*(cnt_lst->mb_cont.mc_numpgs))--; (*(gen_list->mb_cont.mc_numpgs))++; /* * Determine whether or not to keep transferring * buckets to the general list or whether we've * transferred enough already. * We realize that although we may flag another * bucket to be migrated to the general container * that in the meantime, the thread that was * blocked on the cv is already woken up and * long gone. But in that case, the worst * consequence is that we will end up migrating * one bucket too many, which is really not a big * deal, especially if we're close to a critical * situation. */ if (gen_list->mb_cont.mc_starved > 0) { cnt_lst->mb_cont.mc_starved--; cv_signal(&(gen_list->mgl_mstarved)); } else cnt_lst->mb_cont.mc_starved = 0; MB_UNLOCK_CONT(gen_list); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) { /* * We've hit the high limit of allowed numbers of mbufs * on this PCPU list. We must now migrate a bucket * over to the general container. 
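When the watermark migration described above fires, a whole bucket changes hands and the per-CPU and general object/page counters move with it. The bookkeeping reduced to its arithmetic, with illustrative numbers in place of the mc_objcount/mc_numpgs pointers:

    #include <stdio.h>

    int
    main(void)
    {
            long pcpu_objs = 140, gen_objs = 10;     /* mc_objcount pair */
            long pcpu_pgs = 9, gen_pgs = 1;          /* mc_numpgs pair   */
            long wmhigh = 128, bucket_free = 16;     /* illustrative     */

            if (pcpu_objs > wmhigh) {        /* over the per-CPU watermark */
                    pcpu_objs -= bucket_free;        /* whole bucket     */
                    gen_objs += bucket_free;         /* changes hands    */
                    pcpu_pgs--;
                    gen_pgs++;
            }
            printf("pcpu %ld objs/%ld pgs, gen %ld objs/%ld pgs\n",
                pcpu_objs, pcpu_pgs, gen_objs, gen_pgs);
            return (0);
    }
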
*/ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), mb_blist); } SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = MB_GENLIST_OWNER; *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; (*(cnt_lst->mb_cont.mc_numpgs))--; (*(gen_list->mb_cont.mc_numpgs))++; /* * While we're at it, transfer some of the mbtypes * "count load" onto the general list's mbtypes * array, seeing as how we're moving the bucket * there now, meaning that the freeing of objects * there will now decrement the _general list's_ * mbtypes counters, and no longer our PCPU list's * mbtypes counters. We do this for the type presently * being freed in an effort to keep the mbtypes * counters approximately balanced across all lists. */ MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE / mb_list->ml_objsize) - bucket->mb_numfree); MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE / mb_list->ml_objsize) - bucket->mb_numfree); MB_UNLOCK_CONT(gen_list); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } if (bucket->mb_owner & MB_BUCKET_FREE) { SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; } if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } } /* * Drain protocols in hopes to free up some resources. * * LOCKING NOTES: * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * violation if we're holding any mutex if that mutex is acquired in reverse * order relative to one of the locks in the drain routines. */ static void mb_reclaim(void) { struct domain *dp; struct protosw *pr; /* * XXX: Argh, we almost always trip here with witness turned on now-a-days * XXX: because we often come in with Giant held. For now, there's no way * XXX: to avoid this. */ #ifdef WITNESS KASSERT(witness_list(curthread) == 0, ("mb_reclaim() called with locks held")); #endif mbstat.m_drain++; /* XXX: No consistency. */ for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /****************************************************************************** * Internal setup macros. */ #define _mb_setup(m, type) do { \ (m)->m_type = (type); \ (m)->m_next = NULL; \ (m)->m_nextpkt = NULL; \ (m)->m_data = (m)->m_dat; \ (m)->m_flags = 0; \ } while (0) #define _mbhdr_setup(m, type) do { \ (m)->m_type = (type); \ (m)->m_next = NULL; \ (m)->m_nextpkt = NULL; \ (m)->m_data = (m)->m_pktdat; \ (m)->m_flags = M_PKTHDR; \ (m)->m_pkthdr.rcvif = NULL; \ (m)->m_pkthdr.csum_flags = 0; \ - (m)->m_pkthdr.aux = NULL; \ + SLIST_INIT(&(m)->m_pkthdr.tags); \ } while (0) #define _mcl_setup(m) do { \ (m)->m_data = (m)->m_ext.ext_buf; \ (m)->m_flags |= M_EXT; \ (m)->m_ext.ext_free = NULL; \ (m)->m_ext.ext_args = NULL; \ (m)->m_ext.ext_size = MCLBYTES; \ (m)->m_ext.ext_type = EXT_CLUSTER; \ } while (0) #define _mext_init_ref(m, ref) do { \ (m)->m_ext.ref_cnt = ((ref) == NULL) ? 
\ malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \ if ((m)->m_ext.ref_cnt != NULL) { \ *((m)->m_ext.ref_cnt) = 0; \ MEXT_ADD_REF((m)); \ } \ } while (0) #define cl2ref(cl) \ (((uintptr_t)(cl) - (uintptr_t)cl_refcntmap) >> MCLSHIFT) #define _mext_dealloc_ref(m) \ free((m)->m_ext.ref_cnt, M_MBUF) /****************************************************************************** * Internal routines. * * Because mb_alloc() and mb_free() are inlines (to keep the common * cases down to a maximum of one function call), below are a few * routines used only internally for the sole purpose of making certain * functions smaller. * * - _mext_free(): frees associated storage when the ref. count is * exactly one and we're freeing. * * - _mgetm_internal(): common "persistent-lock" routine that allocates * an mbuf and a cluster in one shot, but where the lock is already * held coming in (which is what makes it different from the exported * m_getcl()). The lock is dropped when done. This is used by m_getm() * and, therefore, is very m_getm()-specific. */ static struct mbuf *_mgetm_internal(int, short, short, int); void _mext_free(struct mbuf *mb) { if (mb->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, 0, NULL); } else { (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); _mext_dealloc_ref(mb); } } static struct mbuf * _mgetm_internal(int how, short type, short persist, int cchnum) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum); if (mb == NULL) return NULL; _mb_setup(mb, type); if ((persist & MBP_PERSIST) != 0) { mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); mb = NULL; } _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } return (mb); } /****************************************************************************** * Exported buffer allocation and de-allocation routines. */ /* * Allocate and return a single (normal) mbuf. NULL is returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_get(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) _mb_setup(mb, type); return (mb); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return the top of the provided (existing) * chain. NULL is returned on failure, in which case the [optional] * provided chain is left untouched, and any memory already allocated * is freed. * * Arguments: * - m: existing chain to which to append new chain (optional). * - len: total length of data to append, either in mbufs or clusters * (we allocate whatever combination yields the best fit). * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. 
*/ struct mbuf * m_getm(struct mbuf *m, int len, int how, short type) { struct mbuf *mb, *top, *cur, *mtail; int num, rem, cchnum; short persist; int i; KASSERT(len >= 0, ("m_getm(): len is < 0")); /* If m != NULL, we will append to the end of that chain. */ if (m != NULL) for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); else mtail = NULL; /* * In the best-case scenario (which should be the common case * unless we're in a starvation situation), we will be able to * go through the allocation of all the desired mbufs and clusters * here without dropping our per-CPU cache lock in between. */ num = len / MCLBYTES; rem = len % MCLBYTES; persist = 0; cchnum = -1; top = cur = NULL; for (i = 0; i < num; i++) { mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST | persist, &cchnum); if (mb == NULL) goto failed; _mb_setup(mb, type); persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0; mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); goto failed; } _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); persist = MBP_PERSISTENT; if (cur == NULL) top = cur = mb; else cur->m_next = mb; } if (rem > 0) { if (cchnum >= 0) { persist = MBP_PERSISTENT; persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0; mb = _mgetm_internal(how, type, persist, cchnum); if (mb == NULL) goto failed; } else if (rem > MINCLSIZE) { mb = m_getcl(how, type, 0); } else { mb = m_get(how, type); } if (mb != NULL) { if (cur == NULL) top = mb; else cur->m_next = mb; } else goto failed; } if (mtail != NULL) mtail->m_next = top; else mtail = top; return mtail; failed: if (top != NULL) m_freem(top); return NULL; } /* * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_gethdr(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mbhdr_setup(mb, type); #ifdef MAC if (mac_init_mbuf(mb, how) != 0) { m_free(mb); return NULL; } #endif } return (mb); } /* * Allocate and return a single (normal) pre-zero'd mbuf. NULL is * returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_get_clrd(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mb_setup(mb, type); bzero(mtod(mb, caddr_t), MLEN); } return (mb); } /* * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is * returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_gethdr_clrd(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mbhdr_setup(mb, type); #ifdef MAC if (mac_init_mbuf(mb, how) != 0) { m_free(mb); return NULL; } #endif bzero(mtod(mb, caddr_t), MHLEN); } return (mb); } /* * Free a single mbuf and any associated storage that it may have attached * to it. 
The associated storage may not be immediately freed if its * reference count is above 1. Returns the next mbuf in the chain following * the mbuf being freed. * * Arguments: * - mb: the mbuf to free. */ struct mbuf * m_free(struct mbuf *mb) { struct mbuf *nb; int cchnum; short persist = 0; - /* XXX: This check is bogus... please fix (see KAME). */ - if ((mb->m_flags & M_PKTHDR) != 0 && mb->m_pkthdr.aux) { - m_freem(mb->m_pkthdr.aux); - mb->m_pkthdr.aux = NULL; - } + if ((mb->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(mb, NULL); #ifdef MAC if ((mb->m_flags & M_PKTHDR) && (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED)) mac_destroy_mbuf(mb); #endif nb = mb->m_next; if ((mb->m_flags & M_EXT) != 0) { MEXT_REM_REF(mb); if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) { if (mb->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, MBP_PERSIST, &cchnum); persist = MBP_PERSISTENT; } else { (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); _mext_dealloc_ref(mb); persist = 0; } } } mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum); return (nb); } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. Right now, we only optimize a little so that the cache * lock may be held across a single mbuf+cluster free. Hopefully, * we'll eventually be holding the lock across more than merely two * consecutive frees but right now this is hard to implement because of * things like _mext_dealloc_ref (may do a free()) and atomic ops in the - * loop, as well as the fact that we may recurse on m_freem() in - * m_pkthdr.aux != NULL cases. + * loop. * * - mb: the mbuf chain to free. */ void m_freem(struct mbuf *mb) { struct mbuf *m; int cchnum; short persist; while (mb != NULL) { - /* XXX: This check is bogus... please fix (see KAME). */ - if ((mb->m_flags & M_PKTHDR) != 0 && mb->m_pkthdr.aux) { - m_freem(mb->m_pkthdr.aux); - mb->m_pkthdr.aux = NULL; - } + if ((mb->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(mb, NULL); #ifdef MAC if ((mb->m_flags & M_PKTHDR) && (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED)) mac_destroy_mbuf(mb); #endif persist = 0; m = mb; mb = mb->m_next; if ((m->m_flags & M_EXT) != 0) { MEXT_REM_REF(m); if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) { if (m->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)m->m_ext.ext_buf, MT_NOTMBUF, MBP_PERSIST, &cchnum); persist = MBP_PERSISTENT; } else { (*(m->m_ext.ext_free))(m->m_ext.ext_buf, m->m_ext.ext_args); _mext_dealloc_ref(m); persist = 0; } } } mb_free(&mb_list_mbuf, m, m->m_type, persist, &cchnum); } } /* * Fetch an mbuf with a cluster attached to it. If one of the * allocations fails, the entire allocation fails. This routine is * the preferred way of fetching both the mbuf and cluster together, * as it avoids having to unlock/relock between allocations. Returns * NULL on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. * - flags: any flags to pass to the mbuf being allocated; if this includes * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf. 
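 *
 * A sketch of the common case (hypothetical caller; the lengths are
 * left for the caller to initialize):
 *
 *	struct mbuf *m;
 *
 *	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = 0;
 *	m->m_pkthdr.len = 0;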
*/ struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *mb; int cchnum; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST, &cchnum); if (mb == NULL) return NULL; mb->m_type = type; mb->m_next = NULL; mb->m_flags = flags; if ((flags & M_PKTHDR) != 0) { mb->m_nextpkt = NULL; mb->m_pkthdr.rcvif = NULL; mb->m_pkthdr.csum_flags = 0; - mb->m_pkthdr.aux = NULL; + SLIST_INIT(&mb->m_pkthdr.tags); } mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); mb = NULL; } else { _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } #ifdef MAC /* Guard against a NULL mb here: the cluster allocation may have failed. */ if (mb != NULL && (flags & M_PKTHDR) && (mac_init_mbuf(mb, how) != 0)) { m_free(mb); return NULL; } #endif return (mb); } /* * Fetch a single mbuf cluster and attach it to an existing mbuf. If * successful, configures the provided mbuf to have mbuf->m_ext.ext_buf * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags. * The M_EXT bit is not set on failure. * * Arguments: * - mb: the existing mbuf to which to attach the allocated cluster. * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. */ void m_clget(struct mbuf *mb, int how) { mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, 0, NULL); if (mb->m_ext.ext_buf != NULL) { _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } } /* * Configure a provided mbuf to refer to the provided external storage * buffer and set up a reference count for said buffer. If the setting * up of the reference count fails, the M_EXT bit will not be set. If * successful, the M_EXT bit is set in the mbuf's flags. * * Arguments: * - mb: the existing mbuf to which to attach the provided buffer. * - buf: the address of the provided external storage buffer. * - size: the size of the provided buffer. * - freef: a pointer to a routine that is responsible for freeing the * provided external storage buffer. * - args: a pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * - flags: any other flags to be passed to the provided mbuf. * - type: the type that the external storage buffer should be labeled with. */ void m_extadd(struct mbuf *mb, caddr_t buf, u_int size, void (*freef)(void *, void *), void *args, int flags, int type) { _mext_init_ref(mb, ((type != EXT_CLUSTER) ? NULL : &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)])); if (mb->m_ext.ref_cnt != NULL) { mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_args = args; mb->m_ext.ext_type = type; } } /* * Change type of provided mbuf. This is a relatively expensive operation * (due to the cost of statistics manipulations) and should be avoided, where * possible. * * Arguments: * - mb: the provided mbuf for which the type needs to be changed. * - new_type: the new type to change the mbuf to.
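 *
 * Example (sketch, assuming the standard MT_HEADER type constant from
 * sys/mbuf.h), reclassifying a data mbuf that now fronts a packet:
 *
 *	if (mb->m_type != MT_HEADER)
 *		m_chtype(mb, MT_HEADER);
 *
 * Guarding the call as above avoids the statistics round-trip when the
 * type is already correct.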
*/ void m_chtype(struct mbuf *mb, short new_type) { struct mb_gen_list *gen_list; gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); MB_LOCK_CONT(gen_list); MB_MBTYPES_DEC(gen_list, mb->m_type, 1); MB_MBTYPES_INC(gen_list, new_type, 1); MB_UNLOCK_CONT(gen_list); mb->m_type = new_type; } Index: head/sys/kern/uipc_mbuf.c =================================================================== --- head/sys/kern/uipc_mbuf.c (revision 105193) +++ head/sys/kern/uipc_mbuf.c (revision 105194) @@ -1,739 +1,739 @@ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; /* * sysctl(8) exported objects */ SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, &max_linkhdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, &max_protohdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, &max_datalen, 0, ""); /* * Copy mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. * The packet tag chain moves to "to"; "from" is left with an empty * tag list.
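 *
 * Since this revision replaces the aux chain with packet tags, here is
 * a sketch of how per-packet state is attached and recovered elsewhere
 * (MY_COOKIE and struct my_state are hypothetical; the tag's data area
 * starts right after the m_tag header, hence the "mtag + 1" pointer
 * arithmetic):
 *
 *	struct m_tag *mtag;
 *
 *	mtag = m_tag_alloc(MY_COOKIE, 0, sizeof(struct my_state),
 *	    M_NOWAIT);
 *	if (mtag == NULL)
 *		return (ENOMEM);
 *	bcopy(&state, mtag + 1, sizeof(struct my_state));
 *	m_tag_prepend(m, mtag);
 *
 * and later, on the consuming side:
 *
 *	mtag = m_tag_locate(m, MY_COOKIE, 0, NULL);
 *	if (mtag != NULL)
 *		bcopy(mtag + 1, &state, sizeof(struct my_state));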
*/ void m_copy_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 KASSERT(to->m_flags & M_PKTHDR, ("m_copy_pkthdr() called on non-header")); #endif #ifdef MAC if (to->m_flags & M_PKTHDR) mac_destroy_mbuf(to); #endif to->m_data = to->m_pktdat; to->m_flags = from->m_flags & M_COPYFLAGS; to->m_pkthdr = from->m_pkthdr; #ifdef MAC mac_init_mbuf(to, 1); /* XXXMAC no way to fail */ mac_create_mbuf_from_mbuf(from, to); #endif - from->m_pkthdr.aux = NULL; + SLIST_INIT(&from->m_pkthdr.tags); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; MGET(mn, how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) { M_COPY_PKTHDR(mn, m); #ifdef MAC mac_destroy_mbuf(m); #endif m->m_flags &= ~M_PKTHDR; } mn->m_next = m; m = mn; if (len < MHLEN) MH_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = 0; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } MGET(n, wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { M_COPY_PKTHDR(n, m); if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } if (top == NULL) mbstat.m_mcfail++; /* XXX: No consistency. */ return (top); nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. 
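 *
 * Usage sketch (hypothetical caller): because cluster data is shared
 * by reference, check M_WRITABLE() before modifying the copy:
 *
 *	struct mbuf *n;
 *
 *	n = m_copypacket(m, M_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);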
*/ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MGET(n, how, m->m_type); top = n; if (n == NULL) goto nospace; M_COPY_PKTHDR(n, m); n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { MGET(o, how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; /* Sanity check */ if (m == NULL) return (NULL); KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__)); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ MGET(n, how, m->m_type); if (n == NULL) goto nospace; if (top == NULL) { /* first one, must be PKTHDR */ M_COPY_PKTHDR(n, m); nsize = MHLEN; } else /* not the first one */ nsize = MLEN; if (remain >= MINCLSIZE) { MCLGET(n, how); if ((n->m_flags & M_EXT) == 0) { (void)m_free(n); goto nospace; } nsize = MCLBYTES; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. 
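 *
 * For example (sketch): a caller holding a packet header chain can
 * re-derive the header length afterwards with m_fixhdr(), defined
 * later in this file:
 *
 *	m_cat(m, n);
 *	if (m->m_flags & M_PKTHDR)
 *		(void)m_fixhdr(m);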
*/ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (m->m_flags & M_EXT || m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } m = mp; if (mp->m_flags & M_PKTHDR) m->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; break; } count -= m->m_len; } while (m->m_next) (m = m->m_next) ->m_len = 0; } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod and dtom * will work for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; MGET(m, M_DONTWAIT, n->m_type); if (m == NULL) goto bad; m->m_len = 0; if (n->m_flags & M_PKTHDR) { M_COPY_PKTHDR(m, n); n->m_flags &= ~M_PKTHDR; } } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); mbstat.m_mpfail++; /* XXX: No consistency. */ return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. 
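 *
 * Usage sketch (hypothetical 128-byte split point): on success, m0
 * retains the first 128 bytes and the returned chain holds the rest:
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m0, 128, M_DONTWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);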
*/ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR) { MGETHDR(n, wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ MH_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else MH_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { MGET(n, wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_flags |= M_EXT; n->m_ext = m->m_ext; MEXT_ADD_REF(m); n->m_data = m->m_data + len; } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = 0, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return (NULL); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; len = MHLEN; while (totlen > 0) { if (top) { MGET(m, M_DONTWAIT, MT_DATA); if (m == NULL) { m_freem(top); return (NULL); } len = MLEN; } if (totlen + off >= MINCLSIZE) { MCLGET(m, M_DONTWAIT); if (m->m_flags & M_EXT) len = MCLBYTES; } else { /* * Place initial small packet/header at end of mbuf. */ if (top == NULL && totlen + off + max_linkhdr <= len) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. 
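 *
 * For instance, patching 8 bytes at offset 20 of a chain (sketch;
 * "patch" is a hypothetical local buffer that has been filled in):
 *
 *	u_char patch[8];
 *
 *	m_copyback(m, 20, sizeof(patch), (caddr_t)patch);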
*/ void m_copyback(struct mbuf *m0, int off, int len, caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get_clrd(M_DONTWAIT, m->m_type); if (n == NULL) goto out; n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_DONTWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } void m_print(const struct mbuf *m) { int len; const struct mbuf *m2; len = m->m_pkthdr.len; m2 = m; while (len) { printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-"); len -= m2->m_len; m2 = m2->m_next; } return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } Index: head/sys/kern/uipc_mbuf2.c =================================================================== --- head/sys/kern/uipc_mbuf2.c (revision 105193) +++ head/sys/kern/uipc_mbuf2.c (revision 105194) @@ -1,404 +1,456 @@ /* $FreeBSD$ */ /* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */ /* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */ /* * Copyright (C) 1999 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 */ /*#define PULLDOWN_DEBUG*/ #include #include +#include #include #include #include #include +MALLOC_DEFINE(M_PACKET_TAGS, "tag", "packet-attached information"); + /* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */ static struct mbuf *m_dup1(struct mbuf *, int, int, int); /* * ensure that [off, off + len) is contiguous on the mbuf chain "m". * packet chain before "off" is kept untouched. * if offp == NULL, the target will start at on resulting chain. * if offp != NULL, the target will start at on resulting chain. * * on error return (NULL return value), original "m" will be freed. * * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf. */ struct mbuf * m_pulldown(struct mbuf *m, int off, int len, int *offp) { struct mbuf *n, *o; int hlen, tlen, olen; int writable; /* check invalid arguments. */ if (m == NULL) panic("m == NULL in m_pulldown()"); if (len > MCLBYTES) { m_freem(m); return NULL; /* impossible */ } #ifdef PULLDOWN_DEBUG { struct mbuf *t; printf("before:"); for (t = m; t; t = t->m_next) printf(" %d", t->m_len); printf("\n"); } #endif n = m; while (n != NULL && off > 0) { if (n->m_len > off) break; off -= n->m_len; n = n->m_next; } /* be sure to point non-empty mbuf */ while (n != NULL && n->m_len == 0) n = n->m_next; if (!n) { m_freem(m); return NULL; /* mbuf chain too short */ } /* * XXX: This code is flawed because it considers a "writable" mbuf * data region to require all of the following: * (i) mbuf _has_ to have M_EXT set; if it is just a regular * mbuf, it is still not considered "writable." * (ii) since mbuf has M_EXT, the ext_type _has_ to be * EXT_CLUSTER. Anything else makes it non-writable. * (iii) M_WRITABLE() must evaluate true. * Ideally, the requirement should only be (iii). * * If we're writable, we're sure we're writable, because the ref. count * cannot increase from 1, as that would require posession of mbuf * n by someone else (which is impossible). 
However, if we're _not_ * writable, we may eventually become writable )if the ref. count drops * to 1), but we'll fail to notice it unless we re-evaluate * M_WRITABLE(). For now, we only evaluate once at the beginning and * live with this. */ /* * XXX: This is dumb. If we're just a regular mbuf with no M_EXT, * then we're not "writable," according to this code. */ writable = 0; if ((n->m_flags & M_EXT) == 0 || (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n))) writable = 1; /* * the target data is on . * if we got enough data on the mbuf "n", we're done. */ if ((off == 0 || offp) && len <= n->m_len - off && writable) goto ok; /* * when len <= n->m_len - off and off != 0, it is a special case. * len bytes from sits in single mbuf, but the caller does * not like the starting position (off). * chop the current mbuf into two pieces, set off to 0. */ if (len <= n->m_len - off) { o = m_dup1(n, off, n->m_len - off, M_DONTWAIT); if (o == NULL) { m_freem(m); return NULL; /* ENOBUFS */ } n->m_len = off; o->m_next = n->m_next; n->m_next = o; n = n->m_next; off = 0; goto ok; } /* * we need to take hlen from and tlen from m_next, 0>, * and construct contiguous mbuf with m_len == len. * note that hlen + tlen == len, and tlen > 0. */ hlen = n->m_len - off; tlen = len - hlen; /* * ensure that we have enough trailing data on mbuf chain. * if not, we can do nothing about the chain. */ olen = 0; for (o = n->m_next; o != NULL; o = o->m_next) olen += o->m_len; if (hlen + olen < len) { m_freem(m); return NULL; /* mbuf chain too short */ } /* * easy cases first. * we need to use m_copydata() to get data from m_next, 0>. */ if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen && writable) { m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len); n->m_len += tlen; m_adj(n->m_next, tlen); goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen && writable) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); n->m_len -= hlen; n = n->m_next; off = 0; goto ok; } /* * now, we need to do the hard way. don't m_copy as there's no room * on both end. */ MGET(o, M_DONTWAIT, m->m_type); if (o && len > MLEN) { MCLGET(o, M_DONTWAIT); if ((o->m_flags & M_EXT) == 0) { m_free(o); o = NULL; } } if (!o) { m_freem(m); return NULL; /* ENOBUFS */ } /* get hlen from into */ o->m_len = hlen; bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen); n->m_len -= hlen; /* get tlen from m_next, 0> into */ m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len); o->m_len += tlen; m_adj(n->m_next, tlen); o->m_next = n->m_next; n->m_next = o; n = o; off = 0; ok: #ifdef PULLDOWN_DEBUG { struct mbuf *t; printf("after:"); for (t = m; t; t = t->m_next) printf("%c%d", t == n ? '*' : ' ', t->m_len); printf(" (off=%d)\n", off); } #endif if (offp) *offp = off; return n; } static struct mbuf * m_dup1(struct mbuf *m, int off, int len, int wait) { struct mbuf *n; int l; int copyhdr; if (len > MCLBYTES) return NULL; if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { copyhdr = 1; MGETHDR(n, wait, m->m_type); l = MHLEN; } else { copyhdr = 0; MGET(n, wait, m->m_type); l = MLEN; } if (n && len > l) { MCLGET(n, wait); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) return NULL; if (copyhdr) M_COPY_PKTHDR(n, m); m_copydata(m, off, len, mtod(n, caddr_t)); return n; } -/* - * pkthdr.aux chain manipulation. - * we don't allow clusters at this moment. 
- */ -struct mbuf * -m_aux_add2(struct mbuf *m, int af, int type, void *p) +/* Get a packet tag structure along with specified data following. */ +struct m_tag * +m_tag_alloc(u_int32_t cookie, int type, int len, int wait) { - struct mbuf *n; - struct mauxtag *t; + struct m_tag *t; - if ((m->m_flags & M_PKTHDR) == 0) + if (len < 0) return NULL; + t = malloc(len + sizeof(struct m_tag), M_PACKET_TAGS, wait); + if (t == NULL) + return NULL; + t->m_tag_id = type; + t->m_tag_len = len; + t->m_tag_cookie = cookie; + return t; +} - n = m_aux_find(m, af, type); - if (n) - return n; +/* Free a packet tag. */ +void +m_tag_free(struct m_tag *t) +{ + free(t, M_PACKET_TAGS); +} - MGET(n, M_DONTWAIT, m->m_type); - if (n == NULL) - return NULL; +/* Prepend a packet tag. */ +void +m_tag_prepend(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_prepend: null argument, m %p t %p", m, t)); + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); +} - t = mtod(n, struct mauxtag *); - bzero(t, sizeof(*t)); - t->af = af; - t->type = type; - t->p = p; - n->m_data += sizeof(struct mauxtag); - n->m_len = 0; - n->m_next = m->m_pkthdr.aux; - m->m_pkthdr.aux = n; - return n; +/* Unlink a packet tag. */ +void +m_tag_unlink(struct mbuf *m, struct m_tag *t) +{ + KASSERT(m && t, ("m_tag_unlink: null argument, m %p t %p", m, t)); + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } -struct mbuf * -m_aux_find2(struct mbuf *m, int af, int type, void *p) +/* Unlink and free a packet tag. */ +void +m_tag_delete(struct mbuf *m, struct m_tag *t) { - struct mbuf *n; - struct mauxtag *t; + KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t)); + m_tag_unlink(m, t); + m_tag_free(t); +} - if ((m->m_flags & M_PKTHDR) == 0) - return NULL; +/* Unlink and free a packet tag chain, starting from given tag. */ +void +m_tag_delete_chain(struct mbuf *m, struct m_tag *t) +{ + struct m_tag *p, *q; - for (n = m->m_pkthdr.aux; n; n = n->m_next) { - t = (struct mauxtag *)n->m_dat; - if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { - printf("m_aux_find: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); - continue; - } - if (t->af == af && t->type == type && t->p == p) - return n; + KASSERT(m, ("m_tag_delete_chain: null mbuf")); + if (t != NULL) + p = t; + else + p = SLIST_FIRST(&m->m_pkthdr.tags); + if (p == NULL) + return; + while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) + m_tag_delete(m, q); + m_tag_delete(m, p); +} + +/* Find a tag, starting from a given position. */ +struct m_tag * +m_tag_locate(struct mbuf *m, u_int32_t cookie, int type, struct m_tag *t) +{ + struct m_tag *p; + + KASSERT(m, ("m_tag_find: null mbuf")); + if (t == NULL) + p = SLIST_FIRST(&m->m_pkthdr.tags); + else + p = SLIST_NEXT(t, m_tag_link); + while (p != NULL) { + if (p->m_tag_cookie == cookie && p->m_tag_id == type) + return p; + p = SLIST_NEXT(p, m_tag_link); } return NULL; } -struct mbuf * -m_aux_find(struct mbuf *m, int af, int type) +/* Copy a single tag. */ +struct m_tag * +m_tag_copy(struct m_tag *t) { + struct m_tag *p; - return m_aux_find2(m, af, type, NULL); + KASSERT(t, ("m_tag_copy: null tag")); + p = m_tag_alloc(t->m_tag_cookie, t->m_tag_id, t->m_tag_len, M_NOWAIT); + if (p == NULL) + return (NULL); + bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */ + return p; } -struct mbuf * -m_aux_add(struct mbuf *m, int af, int type) +/* + * Copy two tag chains. The destination mbuf (to) loses any attached + * tags even if the operation fails. 
This should not be a problem, as + * m_tag_copy_chain() is typically called with a newly-allocated + * destination mbuf. + */ +int +m_tag_copy_chain(struct mbuf *to, struct mbuf *from) { + struct m_tag *p, *t, *tprev = NULL; - return m_aux_add2(m, af, type, NULL); + KASSERT(to && from, + ("m_tag_copy: null argument, to %p from %p", to, from)); + m_tag_delete_chain(to, NULL); + SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) { + t = m_tag_copy(p); + if (t == NULL) { + m_tag_delete_chain(to, NULL); + return 0; + } + if (tprev == NULL) + SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link); + else { + SLIST_INSERT_AFTER(tprev, t, m_tag_link); + tprev = t; + } + } + return 1; } +/* Initialize tags on an mbuf. */ void -m_aux_delete(struct mbuf *m, struct mbuf *victim) +m_tag_init(struct mbuf *m) { - struct mbuf *n, *prev, *next; - struct mauxtag *t; + SLIST_INIT(&m->m_pkthdr.tags); +} - if ((m->m_flags & M_PKTHDR) == 0) - return; +/* Get first tag in chain. */ +struct m_tag * +m_tag_first(struct mbuf *m) +{ + return SLIST_FIRST(&m->m_pkthdr.tags); +} - prev = NULL; - n = m->m_pkthdr.aux; - while (n) { - t = (struct mauxtag *)n->m_dat; - next = n->m_next; - if (n->m_data != ((caddr_t)t) + sizeof(struct mauxtag)) { - printf("m_aux_delete: invalid m_data for mbuf=%p (%p %p)\n", n, t, n->m_data); - prev = n; - n = next; - continue; - } - if (n == victim) { - if (prev) - prev->m_next = n->m_next; - else - m->m_pkthdr.aux = n->m_next; - n->m_next = NULL; - m_free(n); - return; - } else - prev = n; - n = next; - } +/* Get next tag in chain. */ +struct m_tag * +m_tag_next(struct mbuf *m, struct m_tag *t) +{ + return SLIST_NEXT(t, m_tag_link); } Index: head/sys/net/bridge.c =================================================================== --- head/sys/net/bridge.c (revision 105193) +++ head/sys/net/bridge.c (revision 105194) @@ -1,1143 +1,1143 @@ /* * Copyright (c) 1998-2002 Luigi Rizzo * * Work partly supported by: Cisco Systems, Inc. - NSITE lab, RTP, NC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This code implements bridging in FreeBSD. It only acts on ethernet * interfaces, including VLANs (others are still usable for routing). * A FreeBSD host can implement multiple logical bridges, called * "clusters". 
Each cluster is made of a set of interfaces, and * identified by a "cluster-id" which is a number in the range 1..2^16-1. * * Bridging is enabled by the sysctl variable * net.link.ether.bridge * the grouping of interfaces into clusters is done with * net.link.ether.bridge_cfg * containing a list of interfaces each optionally followed by * a colon and the cluster it belongs to (1 is the default). * Separators can be * spaces, commas or tabs, e.g. * net.link.ether.bridge_cfg="fxp0:2 fxp1:2 dc0 dc1:1" * Optionally bridged packets can be passed through the firewall, * this is controlled by the variable * net.link.ether.bridge_ipfw * * For each cluster there is a descriptor (cluster_softc) storing * the following data structures: * - a hash table with the MAC address and destination interface for each * known node. The table is indexed using a hash of the source address. * - an array with the MAC addresses of the interfaces used in the cluster. * * Input packets are tapped near the beginning of ether_input(), and * analysed by bridge_in(). Depending on the result, the packet * can be forwarded to one or more output interfaces using bdg_forward(), * and/or sent to the upper layer (e.g. in case of multicast). * * Output packets are intercepted near the end of ether_output(). * The correct destination is selected by bridge_dst_lookup(), * and then forwarding is done by bdg_forward(). * * The arp code is also modified to let a machine answer to requests * irrespective of the port the request came from. * * In case of loops in the bridging topology, the bridge detects this * event and temporarily mutes output bridging on one of the ports. * Periodically, interfaces are unmuted by bdg_timeout(). * Muting is only implemented as a safety measure, and also as * a mechanism to support a user-space implementation of the spanning * tree algorithm. * * To build a bridging kernel, use the following option * option BRIDGE * and then at runtime set the sysctl variable to enable bridging. * * Only one interface per cluster is supposed to have addresses set (but * there are no substantial problems if you set addresses for none or * for more than one interface). * Bridging will act before routing, but nothing prevents a machine * from doing both (modulo bugs in the implementation...). * * THINGS TO REMEMBER * - bridging is incompatible with multicast routing on the same * machine. There is not an easy fix to this. * - be very careful when bridging VLANs * - loop detection is still not very robust. */ #include #include #include #include #include #include /* for net/if.h */ #include /* string functions */ #include #include #include /* for ipfilter */ #include #include #include #include /* for struct arpcom */ #include #include #include #include /* for struct arpcom */ #include #include #include #include /*--------------------*/ /* * For each cluster, source MAC addresses are stored into a hash * table which locates the port they reside on. */ #define HASH_SIZE 8192 /* Table size, must be a power of 2 */ typedef struct hash_table { /* each entry. */ struct ifnet * name; u_char etheraddr[6]; u_int16_t used; /* also, padding */ } bdg_hash_table ; /* * The hash function applied to MAC addresses. Out of the 6 bytes, * the last ones tend to vary more. Since we are on a little endian machine, * we have to do some gimmick... */ #define HASH_FN(addr) ( \ ntohs( ((u_int16_t *)addr)[1] ^ ((u_int16_t *)addr)[2] ) & (HASH_SIZE -1)) /* * This is the data structure where local addresses are stored. 
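 *
 * Membership test sketch (this mirrors the loop in bridge_dst_lookup()
 * below): an address is local to cluster "c" if it matches any entry
 * in the my_macs array:
 *
 *	for (index = c->ports, p = c->my_macs; index; index--, p++)
 *		if (BDG_MATCH(p->etheraddr, eh->ether_dhost))
 *			return BDG_LOCAL;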
*/ struct bdg_addr { u_char etheraddr[6] ; u_int16_t _padding ; }; /* * The configuration of each cluster includes the cluster id, a pointer to * the hash table, and an array of local MAC addresses (of size "ports"). */ struct cluster_softc { u_int16_t cluster_id; u_int16_t ports; bdg_hash_table *ht; struct bdg_addr *my_macs; /* local MAC addresses */ }; extern struct protosw inetsw[]; /* from netinet/ip_input.c */ extern u_char ip_protox[]; /* from netinet/ip_input.c */ static int n_clusters; /* number of clusters */ static struct cluster_softc *clusters; #define BDG_MUTED(ifp) (ifp2sc[ifp->if_index].flags & IFF_MUTE) #define BDG_MUTE(ifp) ifp2sc[ifp->if_index].flags |= IFF_MUTE #define BDG_CLUSTER(ifp) (ifp2sc[ifp->if_index].cluster) #define BDG_SAMECLUSTER(ifp,src) \ (src == NULL || BDG_CLUSTER(ifp) == BDG_CLUSTER(src) ) #ifdef __i386__ #define BDG_MATCH(a,b) ( \ ((u_int16_t *)(a))[2] == ((u_int16_t *)(b))[2] && \ *((u_int32_t *)(a)) == *((u_int32_t *)(b)) ) #define IS_ETHER_BROADCAST(a) ( \ *((u_int32_t *)(a)) == 0xffffffff && \ ((u_int16_t *)(a))[2] == 0xffff ) #else /* for machines that do not support unaligned access */ #define BDG_MATCH(a,b) (!bcmp(a, b, ETHER_ADDR_LEN) ) #define IS_ETHER_BROADCAST(a) (!bcmp(a, "\377\377\377\377\377\377", 6)) #endif /* * For timing-related debugging, you can use the following macros. * remember, rdtsc() only works on Pentium-class machines quad_t ticks; DDB(ticks = rdtsc();) ... interesting code ... DDB(bdg_fw_ticks += (u_long)(rdtsc() - ticks) ; bdg_fw_count++ ;) * */ #define DDB(x) x #define DEB(x) static int bdginit(void); static void parse_bdg_cfg(void); static int bdg_ipf; /* IPFilter enabled in bridge */ static int bdg_ipfw; #if 0 /* debugging only */ static char *bdg_dst_names[] = { "BDG_NULL ", "BDG_BCAST ", "BDG_MCAST ", "BDG_LOCAL ", "BDG_DROP ", "BDG_UNKNOWN ", "BDG_IN ", "BDG_OUT ", "BDG_FORWARD " }; #endif /* * System initialization */ static struct bdg_stats bdg_stats ; static struct callout_handle bdg_timeout_h ; /* * Add an interface to a cluster, possibly creating a new entry in * the cluster table. This requires reallocation of the table and * updating pointers in ifp2sc. 
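 *
 * Typical call, as issued from parse_bdg_cfg() below; note that the
 * cluster id is passed in network byte order:
 *
 *	b->cluster = add_cluster(htons(cluster), (struct arpcom *)ifp);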
*/ static struct cluster_softc * add_cluster(u_int16_t cluster_id, struct arpcom *ac) { struct cluster_softc *c = NULL; int i; for (i = 0; i < n_clusters ; i++) if (clusters[i].cluster_id == cluster_id) goto found; /* Not found, need to reallocate */ c = malloc((1+n_clusters) * sizeof (*c), M_IFADDR, M_DONTWAIT | M_ZERO); if (c == NULL) {/* malloc failure */ printf("-- bridge: cannot add new cluster\n"); return NULL; } c[n_clusters].ht = (struct hash_table *) malloc(HASH_SIZE * sizeof(struct hash_table), M_IFADDR, M_WAITOK | M_ZERO); if (c[n_clusters].ht == NULL) { printf("-- bridge: cannot allocate hash table for new cluster\n"); free(c, M_IFADDR); return NULL; } c[n_clusters].my_macs = (struct bdg_addr *) malloc(BDG_MAX_PORTS * sizeof(struct bdg_addr), M_IFADDR, M_WAITOK | M_ZERO); if (c[n_clusters].my_macs == NULL) { printf("-- bridge: cannot allocate mac addr table for new cluster\n"); free(c[n_clusters].ht, M_IFADDR); free(c, M_IFADDR); return NULL; } c[n_clusters].cluster_id = cluster_id; c[n_clusters].ports = 0; /* * now copy old descriptors here */ if (n_clusters > 0) { for (i=0; i < n_clusters; i++) c[i] = clusters[i]; /* * and finally update pointers in ifp2sc */ for (i = 0 ; i < if_index && i < BDG_MAX_PORTS; i++) if (ifp2sc[i].cluster != NULL) ifp2sc[i].cluster = c + (ifp2sc[i].cluster - clusters); free(clusters, M_IFADDR); } clusters = c; i = n_clusters; /* index of cluster entry */ n_clusters++; found: c = clusters + i; /* the right cluster ... */ bcopy(ac->ac_enaddr, &(c->my_macs[c->ports]), 6); c->ports++; return c; } /* * Turn off bridging, by clearing promisc mode on the interface, * marking the interface as unused, and clearing the name in the * stats entry. * Also dispose the hash tables associated with the clusters. */ static void bridge_off(void) { struct ifnet *ifp ; int i, s; DEB(printf("bridge_off: n_clusters %d\n", n_clusters);) TAILQ_FOREACH(ifp, &ifnet, if_link) { struct bdg_softc *b; if (ifp->if_index >= BDG_MAX_PORTS) continue; /* make sure we do not go beyond the end */ b = &(ifp2sc[ifp->if_index]); if ( b->flags & IFF_BDG_PROMISC ) { s = splimp(); ifpromisc(ifp, 0); splx(s); b->flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ; DEB(printf(">> now %s%d promisc OFF if_flags 0x%x bdg_flags 0x%x\n", ifp->if_name, ifp->if_unit, ifp->if_flags, b->flags);) } b->flags &= ~(IFF_USED) ; b->cluster = NULL; bdg_stats.s[ifp->if_index].name[0] = '\0'; } /* flush_tables */ s = splimp(); for (i=0; i < n_clusters; i++) { free(clusters[i].ht, M_IFADDR); free(clusters[i].my_macs, M_IFADDR); } if (clusters != NULL) free(clusters, M_IFADDR); clusters = NULL; n_clusters =0; splx(s); } /* * set promisc mode on the interfaces we use. */ static void bridge_on(void) { struct ifnet *ifp ; int s ; TAILQ_FOREACH(ifp, &ifnet, if_link) { struct bdg_softc *b = &ifp2sc[ifp->if_index]; if ( !(b->flags & IFF_USED) ) continue ; if ( !( ifp->if_flags & IFF_UP) ) { s = splimp(); if_up(ifp); splx(s); } if ( !(b->flags & IFF_BDG_PROMISC) ) { int ret ; s = splimp(); ret = ifpromisc(ifp, 1); splx(s); b->flags |= IFF_BDG_PROMISC ; DEB(printf(">> now %s%d promisc ON if_flags 0x%x bdg_flags 0x%x\n", ifp->if_name, ifp->if_unit, ifp->if_flags, b->flags);) } if (b->flags & IFF_MUTE) { DEB(printf(">> unmuting %s%d\n", ifp->if_name, ifp->if_unit);) b->flags &= ~IFF_MUTE; } } } /** * reconfigure bridge. * This is also done every time we attach or detach an interface. * Main use is to make sure that we do not bridge on some old * (ejected) device. 
So, it would be really useful to have a * pointer to the modified device as an argument. Without it, we * have to scan all interfaces. */ static void reconfigure_bridge(void) { bridge_off(); if (do_bridge) { if (if_index >= BDG_MAX_PORTS) { printf("-- sorry too many interfaces (%d, max is %d)," " disabling bridging\n", if_index, BDG_MAX_PORTS); do_bridge=0; return; } parse_bdg_cfg(); bridge_on(); } } static char bridge_cfg[1024]; /* in BSS so initialized to all NULs */ /* * parse the config string, set IFF_USED, name and cluster_id * for all interfaces found. * The config string is a list of "if[:cluster]" with * a number of possible separators (see "sep"). In particular the * use of the space lets you set bridge_cfg with the output from * "ifconfig -l" */ static void parse_bdg_cfg() { char *p, *beg ; int l, cluster; static char *sep = ", \t"; for (p = bridge_cfg; *p ; p++) { struct ifnet *ifp; int found = 0; char c; if (index(sep, *p)) /* skip separators */ continue ; /* names are lowercase and digits */ for ( beg = p ; islower(*p) || isdigit(*p) ; p++ ) ; l = p - beg ; /* length of name string */ if (l == 0) /* invalid name */ break ; if ( *p != ':' ) /* no ':', assume default cluster 1 */ cluster = 1 ; else /* fetch cluster */ cluster = strtoul( p+1, &p, 10); c = *p; *p = '\0'; /* * now search in interface list for a matching name */ TAILQ_FOREACH(ifp, &ifnet, if_link) { char buf[IFNAMSIZ]; snprintf(buf, sizeof(buf), "%s%d", ifp->if_name, ifp->if_unit); if (!strncmp(beg, buf, max(l, strlen(buf)))) { struct bdg_softc *b = &ifp2sc[ifp->if_index]; if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) { printf("%s is not an ethernet, continue\n", buf); continue; } if (b->flags & IFF_USED) { printf("%s already used, skipping\n", buf); break; } b->cluster = add_cluster(htons(cluster), (struct arpcom *)ifp); b->flags |= IFF_USED ; sprintf(bdg_stats.s[ifp->if_index].name, "%s%d:%d", ifp->if_name, ifp->if_unit, cluster); DEB(printf("--++ found %s next c %d\n", bdg_stats.s[ifp->if_index].name, c);) found = 1; break ; } } if (!found) printf("interface %s Not found in bridge\n", beg); *p = c; if (c == '\0') break; /* no more */ } } /* * handler for net.link.ether.bridge */ static int sysctl_bdg(SYSCTL_HANDLER_ARGS) { int error, oldval = do_bridge ; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); DEB( printf("called sysctl for bridge name %s arg2 %d val %d->%d\n", oidp->oid_name, oidp->oid_arg2, oldval, do_bridge); ) if (oldval != do_bridge) reconfigure_bridge(); return error ; } /* * handler for net.link.ether.bridge_cfg */ static int sysctl_bdg_cfg(SYSCTL_HANDLER_ARGS) { int error = 0 ; char old_cfg[1024] ; strcpy(old_cfg, bridge_cfg) ; error = sysctl_handle_string(oidp, bridge_cfg, oidp->oid_arg2, req); DEB( printf("called sysctl for bridge name %s arg2 %d err %d val %s->%s\n", oidp->oid_name, oidp->oid_arg2, error, old_cfg, bridge_cfg); ) if (strcmp(old_cfg, bridge_cfg)) reconfigure_bridge(); return error ; } static int sysctl_refresh(SYSCTL_HANDLER_ARGS) { if (req->newptr) reconfigure_bridge(); return 0; } SYSCTL_DECL(_net_link_ether); SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_cfg, CTLTYPE_STRING|CTLFLAG_RW, &bridge_cfg, sizeof(bridge_cfg), &sysctl_bdg_cfg, "A", "Bridge configuration"); SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge, CTLTYPE_INT|CTLFLAG_RW, &do_bridge, 0, &sysctl_bdg, "I", "Bridging"); SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw, CTLFLAG_RW, &bdg_ipfw,0,"Pass bridged pkts through firewall"); SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipf, 
CTLFLAG_RW, &bdg_ipf, 0,"Pass bridged pkts through IPFilter"); /* * The follow macro declares a variable, and maps it to * a SYSCTL_INT entry with the same name. */ #define SY(parent, var, comment) \ static int var ; \ SYSCTL_INT(parent, OID_AUTO, var, CTLFLAG_RW, &(var), 0, comment); int bdg_ipfw_drops; SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_drop, CTLFLAG_RW, &bdg_ipfw_drops,0,""); int bdg_ipfw_colls; SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_collisions, CTLFLAG_RW, &bdg_ipfw_colls,0,""); SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_refresh, CTLTYPE_INT|CTLFLAG_WR, NULL, 0, &sysctl_refresh, "I", "iface refresh"); #if 1 /* diagnostic vars */ SY(_net_link_ether, verbose, "Be verbose"); SY(_net_link_ether, bdg_split_pkts, "Packets split in bdg_forward"); SY(_net_link_ether, bdg_thru, "Packets through bridge"); SY(_net_link_ether, bdg_copied, "Packets copied in bdg_forward"); SY(_net_link_ether, bdg_copy, "Force copy in bdg_forward"); SY(_net_link_ether, bdg_predict, "Correctly predicted header location"); SY(_net_link_ether, bdg_fw_avg, "Cycle counter avg"); SY(_net_link_ether, bdg_fw_ticks, "Cycle counter item"); SY(_net_link_ether, bdg_fw_count, "Cycle counter count"); #endif SYSCTL_STRUCT(_net_link_ether, PF_BDG, bdgstats, CTLFLAG_RD, &bdg_stats , bdg_stats, "bridge statistics"); static int bdg_loops ; /* * called periodically to flush entries etc. */ static void bdg_timeout(void *dummy) { static int slowtimer; /* in BSS so initialized to 0 */ if (do_bridge) { static int age_index = 0 ; /* index of table position to age */ int l = age_index + HASH_SIZE/4 ; int i; /* * age entries in the forwarding table. */ if (l > HASH_SIZE) l = HASH_SIZE ; for (i=0; i= HASH_SIZE) age_index = 0 ; if (--slowtimer <= 0 ) { slowtimer = 5 ; bridge_on() ; /* we just need unmute, really */ bdg_loops = 0 ; } } bdg_timeout_h = timeout(bdg_timeout, NULL, 2*hz ); } /* * Find the right pkt destination: * BDG_BCAST is a broadcast * BDG_MCAST is a multicast * BDG_LOCAL is for a local address * BDG_DROP must be dropped * other ifp of the dest. interface (incl.self) * * We assume this is only called for interfaces for which bridging * is enabled, i.e. BDG_USED(ifp) is true. */ static __inline struct ifnet * bridge_dst_lookup(struct ether_header *eh, struct cluster_softc *c) { struct ifnet *dst ; int index ; struct bdg_addr *p ; bdg_hash_table *bt; /* pointer to entry in hash table */ if (IS_ETHER_BROADCAST(eh->ether_dhost)) return BDG_BCAST ; if (eh->ether_dhost[0] & 1) return BDG_MCAST ; /* * Lookup local addresses in case one matches. */ for (index = c->ports, p = c->my_macs; index ; index--, p++ ) if (BDG_MATCH(p->etheraddr, eh->ether_dhost) ) return BDG_LOCAL ; /* * Look for a possible destination in table */ index= HASH_FN( eh->ether_dhost ); bt = &(c->ht[index]); dst = bt->name; if ( dst && BDG_MATCH( bt->etheraddr, eh->ether_dhost) ) return dst ; else return BDG_UNKNOWN ; } /** * bridge_in() is invoked to perform bridging decision on input packets. * * On Input: * eh Ethernet header of the incoming packet. * ifp interface the packet is coming from. * * On Return: destination of packet, one of * BDG_BCAST broadcast * BDG_MCAST multicast * BDG_LOCAL is only for a local address (do not forward) * BDG_DROP drop the packet * ifp ifp of the destination interface. * * Forwarding is not done directly to give a chance to some drivers * to fetch more of the packet, or simply drop it completely. 
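 *
 * Caller-side sketch (hypothetical driver code in the ether_input()
 * path; "m" is the received packet):
 *
 *	struct ifnet *bdg_dst;
 *
 *	bdg_dst = bridge_in(ifp, eh);
 *	if (bdg_dst == BDG_DROP) {
 *		m_freem(m);
 *		return;
 *	}
 *
 * otherwise the packet is handed to bdg_forward() and/or passed up the
 * local stack, depending on the returned destination.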
*/ static struct ifnet * bridge_in(struct ifnet *ifp, struct ether_header *eh) { int index; struct ifnet *dst , *old ; bdg_hash_table *bt; /* location in hash table */ int dropit = BDG_MUTED(ifp) ; /* * hash the source address */ index= HASH_FN(eh->ether_shost); bt = &(ifp2sc[ifp->if_index].cluster->ht[index]); bt->used = 1 ; old = bt->name ; if ( old ) { /* the entry is valid. */ if (!BDG_MATCH( eh->ether_shost, bt->etheraddr) ) { bdg_ipfw_colls++ ; bt->name = NULL ; } else if (old != ifp) { /* * Found a loop. Either a machine has moved, or there * is a misconfiguration/reconfiguration of the network. * First, do not forward this packet! * Record the relocation anyways; then, if loops persist, * suspect a reconfiguration and disable forwarding * from the old interface. */ bt->name = ifp ; /* relocate address */ printf("-- loop (%d) %6D to %s%d from %s%d (%s)\n", bdg_loops, eh->ether_shost, ".", ifp->if_name, ifp->if_unit, old->if_name, old->if_unit, BDG_MUTED(old) ? "muted":"active"); dropit = 1 ; if ( !BDG_MUTED(old) ) { if (++bdg_loops > 10) BDG_MUTE(old) ; } } } /* * now write the source address into the table */ if (bt->name == NULL) { DEB(printf("new addr %6D at %d for %s%d\n", eh->ether_shost, ".", index, ifp->if_name, ifp->if_unit);) bcopy(eh->ether_shost, bt->etheraddr, 6); bt->name = ifp ; } dst = bridge_dst_lookup(eh, ifp2sc[ifp->if_index].cluster); /* * bridge_dst_lookup can return the following values: * BDG_BCAST, BDG_MCAST, BDG_LOCAL, BDG_UNKNOWN, BDG_DROP, ifp. * For muted interfaces, or when we detect a loop, the first 3 are * changed in BDG_LOCAL (we still listen to incoming traffic), * and others to BDG_DROP (no use for the local host). * Also, for incoming packets, ifp is changed to BDG_DROP if ifp == src. * These changes are not necessary for outgoing packets from ether_output(). */ BDG_STAT(ifp, BDG_IN); switch ((uintptr_t)dst) { case (uintptr_t)BDG_BCAST: case (uintptr_t)BDG_MCAST: case (uintptr_t)BDG_LOCAL: case (uintptr_t)BDG_UNKNOWN: case (uintptr_t)BDG_DROP: BDG_STAT(ifp, dst); break ; default : if (dst == ifp || dropit) BDG_STAT(ifp, BDG_DROP); else BDG_STAT(ifp, BDG_FORWARD); break ; } if ( dropit ) { if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_LOCAL) dst = BDG_LOCAL ; else dst = BDG_DROP ; } else { if (dst == ifp) dst = BDG_DROP; } DEB(printf("bridge_in %6D ->%6D ty 0x%04x dst %s%d\n", eh->ether_shost, ".", eh->ether_dhost, ".", ntohs(eh->ether_type), (dst <= BDG_FORWARD) ? bdg_dst_names[(int)dst] : dst->if_name, (dst <= BDG_FORWARD) ? 0 : dst->if_unit); ) return dst ; } /* * Forward a packet to dst -- which can be a single interface or * an entire cluster. The src port and muted interfaces are excluded. * * If src == NULL, the pkt comes from ether_output, and dst is the real * interface the packet is originally sent to. In this case, we must forward * it to the whole cluster. * We never call bdg_forward from ether_output on interfaces which are * not part of a cluster. * * If possible (i.e. we can determine that the caller does not need * a copy), the packet is consumed here, and bdg_forward returns NULL. * Otherwise, a pointer to a copy of the packet is returned. 
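 *
 * A hedged sketch of that contract from a hypothetical caller:
 *
 *	m = bdg_forward_ptr(m, eh, dst);
 *	if (m == NULL)
 *		return;
 *
 * and if m is non-NULL the caller still owns it, and must transmit or
 * m_freem() it itself.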
* * XXX be careful with eh, it can be a pointer into *m */ static struct mbuf * bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst) { struct ifnet *src; struct ifnet *ifp, *last; int shared = bdg_copy ; /* someone else is using the mbuf */ int once = 0; /* loop only once */ struct ifnet *real_dst = dst ; /* real dst from ether_output */ struct ip_fw_args args; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; int rv; #endif /* PFIL_HOOKS */ /* * XXX eh is usually a pointer within the mbuf (some ethernet drivers * do that), so we better copy it before doing anything with the mbuf, * or we might corrupt the header. */ struct ether_header save_eh = *eh ; DEB(quad_t ticks; ticks = rdtsc();) args.rule = NULL; /* did we match a firewall rule ? */ /* Fetch state from dummynet tag, ignore others */ for (;m0->m_type == MT_TAG; m0 = m0->m_next) - if (m0->m_tag_id == PACKET_TAG_DUMMYNET) { + if (m0->_m_tag_id == PACKET_TAG_DUMMYNET) { args.rule = ((struct dn_pkt *)m0)->rule; shared = 0; /* For sure this is our own mbuf. */ } if (args.rule == NULL) bdg_thru++; /* first time through bdg_forward, count packet */ src = m0->m_pkthdr.rcvif; if (src == NULL) /* packet from ether_output */ dst = bridge_dst_lookup(eh, ifp2sc[real_dst->if_index].cluster); if (dst == BDG_DROP) { /* this should not happen */ printf("xx bdg_forward for BDG_DROP\n"); m_freem(m0); return NULL; } if (dst == BDG_LOCAL) { /* this should not happen as well */ printf("xx ouch, bdg_forward for local pkt\n"); return m0; } if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_UNKNOWN) { ifp = TAILQ_FIRST(&ifnet) ; /* scan all ports */ once = 0 ; if (dst != BDG_UNKNOWN) /* need a copy for the local stack */ shared = 1 ; } else { ifp = dst ; once = 1 ; } if ((uintptr_t)(ifp) <= (u_int)BDG_FORWARD) panic("bdg_forward: bad dst"); /* * Do filtering in a very similar way to what is done in ip_output. * Only if firewall is loaded, enabled, and the packet is not * from ether_output() (src==NULL, or we would filter it twice). * Additional restrictions may apply e.g. non-IP, short packets, * and pkts already gone through a pipe. */ if (src != NULL && ( #ifdef PFIL_HOOKS ((pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh)) != NULL && bdg_ipf !=0) || #endif (IPFW_LOADED && bdg_ipfw != 0))) { int i; if (args.rule != NULL && fw_one_pass) goto forward; /* packet already partially processed */ /* * i need some amt of data to be contiguous, and in case others need * the packet (shared==1) also better be in the first mbuf. */ i = min(m0->m_pkthdr.len, max_protohdr) ; if ( shared || m0->m_len < i) { m0 = m_pullup(m0, i) ; if (m0 == NULL) { printf("-- bdg: pullup failed.\n") ; return NULL ; } } #ifdef PFIL_HOOKS /* * NetBSD-style generic packet filter, pfil(9), hooks. * Enables ipf(8) in bridging. */ if (m0->m_pkthdr.len >= sizeof(struct ip) && ntohs(save_eh.ether_type) == ETHERTYPE_IP) { /* * before calling the firewall, swap fields the same as IP does. * here we assume the pkt is an IP one and the header is contiguous */ struct ip *ip = mtod(m0, struct ip *); ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, ip->ip_hl << 2, src, 0, &m0); if (rv != 0 || m0 == NULL) return m0; ip = mtod(m0, struct ip *); } /* * If we get here, the firewall has passed the pkt, but the mbuf * pointer might have changed. Restore ip and the fields ntohs()'d. 
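 * (That is, ip_len and ip_off are in host order while the pfil hooks
 * run, just as ip_input() would hand them over; the htons() calls
 * right below put them back in network order before the mbuf
 * continues through the bridge.)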
*/ ip = mtod(m0, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); } #endif /* PFIL_HOOKS */ /* * Prepare arguments and call the firewall. */ if (!IPFW_LOADED || bdg_ipfw == 0) goto forward; /* not using ipfw, accept the packet */ /* * XXX The following code is very similar to the one in * if_ethersubr.c:ether_ipfw_chk() */ args.m = m0; /* the packet we are looking at */ args.oif = NULL; /* this is an input packet */ args.divert_rule = 0; /* we do not support divert yet */ args.next_hop = NULL; /* we do not support forward yet */ args.eh = &save_eh; /* MAC header for bridged/MAC packets */ i = ip_fw_chk_ptr(&args); m0 = args.m; /* in case the firewall used the mbuf */ if ( (i & IP_FW_PORT_DENY_FLAG) || m0 == NULL) /* drop */ return m0 ; if (i == 0) /* a PASS rule. */ goto forward ; if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG)) { /* * Pass the pkt to dummynet, which consumes it. * If shared, make a copy and keep the original. */ struct mbuf *m ; if (shared) { m = m_copypacket(m0, M_DONTWAIT); if (m == NULL) /* copy failed, give up */ return m0; } else { m = m0 ; /* pass the original to dummynet */ m0 = NULL ; /* and nothing back to the caller */ } /* * Prepend the header, optimize for the common case of * eh pointing into the mbuf. */ if ( (void *)(eh + 1) == (void *)m->m_data) { m->m_data -= ETHER_HDR_LEN ; m->m_len += ETHER_HDR_LEN ; m->m_pkthdr.len += ETHER_HDR_LEN ; bdg_predict++; } else { M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); if (m == NULL) /* nope... */ return m0 ; bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } args.oif = real_dst; ip_dn_io_ptr(m, (i & 0xffff),DN_TO_BDG_FWD, &args); return m0 ; } /* * XXX at some point, add support for divert/forward actions. * If none of the above matches, we have to drop the packet. */ bdg_ipfw_drops++ ; return m0 ; } forward: /* * Again, bring up the headers in case of shared bufs to avoid * corruptions in the future. */ if ( shared ) { int i = min(m0->m_pkthdr.len, max_protohdr) ; m0 = m_pullup(m0, i) ; if (m0 == NULL) return NULL ; } /* * now real_dst is used to determine the cluster where to forward. * For packets coming from ether_input, this is the one of the 'src' * interface, whereas for locally generated packets (src==NULL) it * is the cluster of the original destination interface, which * was already saved into real_dst. */ if (src != NULL) real_dst = src ; last = NULL; for (;;) { if (last) { /* need to forward packet leftover from previous loop */ struct mbuf *m ; if (shared == 0 && once ) { /* no need to copy */ m = m0 ; m0 = NULL ; /* original is gone */ } else { m = m_copypacket(m0, M_DONTWAIT); if (m == NULL) { printf("bdg_forward: sorry, m_copypacket failed!\n"); return m0 ; /* the original is still there... */ } } /* * Add header (optimized for the common case of eh pointing * already into the mbuf) and execute last part of ether_output: * queue pkt and start output if interface not yet active. */ if ( (void *)(eh + 1) == (void *)m->m_data) { m->m_data -= ETHER_HDR_LEN ; m->m_len += ETHER_HDR_LEN ; m->m_pkthdr.len += ETHER_HDR_LEN ; bdg_predict++; } else { M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); if (!m && verbose) printf("M_PREPEND failed\n"); if (m == NULL) return m0; bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } if (!IF_HANDOFF(&last->if_snd, m, last)) { #if 0 BDG_MUTE(last); /* should I also mute ? 
*/ #endif } BDG_STAT(last, BDG_OUT); last = NULL ; if (once) break ; } if (ifp == NULL) break ; /* * If the interface is used for bridging, not muted, not full, * up and running, is not the source interface, and belongs to * the same cluster as the 'real_dst', then send here. */ if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !_IF_QFULL(&ifp->if_snd) && (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) && ifp != src && BDG_SAMECLUSTER(ifp, real_dst) ) last = ifp ; ifp = TAILQ_NEXT(ifp, if_link) ; if (ifp == NULL) once = 1 ; } DEB(bdg_fw_ticks += (u_long)(rdtsc() - ticks) ; bdg_fw_count++ ; if (bdg_fw_count != 0) bdg_fw_avg = bdg_fw_ticks/bdg_fw_count; ) return m0 ; } /* * initialization of bridge code. */ static int bdginit(void) { printf("BRIDGE 020214 loaded\n"); ifp2sc = malloc(BDG_MAX_PORTS * sizeof(struct bdg_softc), M_IFADDR, M_WAITOK | M_ZERO ); if (ifp2sc == NULL) return ENOMEM ; bridge_in_ptr = bridge_in; bdg_forward_ptr = bdg_forward; bdgtakeifaces_ptr = reconfigure_bridge; n_clusters = 0; clusters = NULL; do_bridge=0; bzero(&bdg_stats, sizeof(bdg_stats) ); bdgtakeifaces_ptr(); bdg_timeout(0); return 0 ; } /* * initialization code, both for static and dynamic loading. */ static int bridge_modevent(module_t mod, int type, void *unused) { int s; int err = 0 ; switch (type) { case MOD_LOAD: if (BDG_LOADED) { err = EEXIST; break ; } s = splimp(); err = bdginit(); splx(s); break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("bridge statically compiled, cannot unload\n"); err = EINVAL ; #else s = splimp(); do_bridge = 0; bridge_in_ptr = NULL; bdg_forward_ptr = NULL; bdgtakeifaces_ptr = NULL; untimeout(bdg_timeout, NULL, bdg_timeout_h); bridge_off(); if (clusters) free(clusters, M_IFADDR); free(ifp2sc, M_IFADDR); ifp2sc = NULL ; splx(s); #endif break; default: err = EINVAL ; break; } return err; } static moduledata_t bridge_mod = { "bridge", bridge_modevent, 0 }; DECLARE_MODULE(bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(bridge, 1); Index: head/sys/net/if_gre.c =================================================================== --- head/sys/net/if_gre.c (revision 105193) +++ head/sys/net/if_gre.c (revision 105194) @@ -1,756 +1,756 @@ /* $NetBSD: if_gre.c,v 1.42 2002/08/14 00:23:27 itojun Exp $ */ /* $FreeBSD$ */ /* * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Encapsulate L3 protocols into IP * See RFC 1701 and 1702 for more details. * If_gre is compatible with Cisco GRE tunnels, so you can * have a NetBSD box as the other end of a tunnel interface of a Cisco * router. See gre(4) for more details. * Also supported: IP in IP encaps (proto 55) as of RFC 2004 */ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_ns.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #include #include #include #include #else #error "Huh? if_gre without inet?" #endif #include #include #include /* * It is not easy to calculate the right value for a GRE MTU. * We leave this task to the admin and use the same default that * other vendors use. */ #define GREMTU 1476 #define GRENAME "gre" static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); struct gre_softc_head gre_softc_list; static int gre_clone_create __P((struct if_clone *, int)); static void gre_clone_destroy __P((struct ifnet *)); static int gre_ioctl(struct ifnet *, u_long, caddr_t); static int gre_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *rt); static struct if_clone gre_cloner = IF_CLONE_INITIALIZER("gre", gre_clone_create, gre_clone_destroy, 0, IF_MAXUNIT); static int gre_compute_route(struct gre_softc *sc); static void greattach __P((void)); #ifdef INET extern struct domain inetdomain; static const struct protosw in_gre_protosw = { SOCK_RAW, &inetdomain, IPPROTO_GRE, PR_ATOMIC|PR_ADDR, (pr_input_t*)gre_input, (pr_output_t*)rip_output, rip_ctlinput, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; static const struct protosw in_mobile_protosw = { SOCK_RAW, &inetdomain, IPPROTO_MOBILE, PR_ATOMIC|PR_ADDR, (pr_input_t*)gre_mobile_input, (pr_output_t*)rip_output, rip_ctlinput, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_OTHER, gre, CTLFLAG_RW, 0, "Generic Routing Encapsulation"); #ifndef MAX_GRE_NEST /* * This macro controls the default upper limitation on nesting of gre tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gre tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. 
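 *
 * The limit is also exported read/write at run time through the
 * net.link.gre.max_nesting sysctl declared below, so as a sketch
 * (no kernel rebuild needed):
 *	sysctl net.link.gre.max_nesting=2
 * raises the recursion bound that gre_output() enforces.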
*/ #define MAX_GRE_NEST 1 #endif static int max_gre_nesting = MAX_GRE_NEST; SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW, &max_gre_nesting, 0, "Max nested tunnels"); /* ARGSUSED */ static void greattach(void) { LIST_INIT(&gre_softc_list); if_clone_attach(&gre_cloner); } static int gre_clone_create(ifc, unit) struct if_clone *ifc; int unit; { struct gre_softc *sc; sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK); memset(sc, 0, sizeof(struct gre_softc)); sc->sc_if.if_name = GRENAME; sc->sc_if.if_softc = sc; sc->sc_if.if_unit = unit; sc->sc_if.if_snd.ifq_maxlen = IFQ_MAXLEN; sc->sc_if.if_type = IFT_OTHER; sc->sc_if.if_addrlen = 0; sc->sc_if.if_hdrlen = 24; /* IP + GRE */ sc->sc_if.if_mtu = GREMTU; sc->sc_if.if_flags = IFF_POINTOPOINT|IFF_MULTICAST; sc->sc_if.if_output = gre_output; sc->sc_if.if_ioctl = gre_ioctl; sc->g_dst.s_addr = sc->g_src.s_addr = INADDR_ANY; sc->g_proto = IPPROTO_GRE; sc->sc_if.if_flags |= IFF_LINK0; sc->encap = NULL; sc->called = 0; if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_NULL, sizeof(u_int32_t)); LIST_INSERT_HEAD(&gre_softc_list, sc, sc_list); return (0); } static void gre_clone_destroy(ifp) struct ifnet *ifp; { struct gre_softc *sc = ifp->if_softc; #ifdef INET if (sc->encap != NULL) encap_detach(sc->encap); #endif LIST_REMOVE(sc, sc_list); bpfdetach(ifp); if_detach(ifp); free(sc, M_GRE); } /* * The output routine. Takes a packet and encapsulates it in the protocol * given by sc->g_proto. See also RFC 1701 and RFC 2004 */ static int gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { int error = 0; struct gre_softc *sc = ifp->if_softc; struct greip *gh; struct ip *ip; u_char osrc; u_short etype = 0; struct mobile_h mob_h; /* * gre may cause infinite recursion when misconfigured. * We prevent this by introducing an upper limit. */ if (++(sc->called) > max_gre_nesting) { printf("%s: gre_output: recursively called too many " "times(%d)\n", if_name(&sc->sc_if), sc->called); m_freem(m); error = EIO; /* is there a better errno? */ goto end; } if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) == 0 || sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) { m_freem(m); error = ENETDOWN; goto end; } gh = NULL; ip = NULL; osrc = 0; if (ifp->if_bpf) { /* see the comment in other if_foo.c files */ struct mbuf m0; u_int32_t af = dst->sa_family; m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; bpf_mtap(ifp, &m0); } m->m_flags &= ~(M_BCAST|M_MCAST); if (sc->g_proto == IPPROTO_MOBILE) { if (dst->sa_family == AF_INET) { struct mbuf *m0; int msiz; ip = mtod(m, struct ip *); /* * RFC2004 specifies that fragmented datagrams shouldn't * be encapsulated. */ if ((ip->ip_off & IP_MF) != 0) { _IF_DROP(&ifp->if_snd); m_freem(m); error = EINVAL; /* is there a better errno? */ goto end; } memset(&mob_h, 0, MOB_H_SIZ_L); mob_h.proto = (ip->ip_p) << 8; mob_h.odst = ip->ip_dst.s_addr; ip->ip_dst.s_addr = sc->g_dst.s_addr; /* * If the packet comes from our host, we only change * the destination address in the IP header. 
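 * (i.e. the short MOB_H_SIZ_S form of the mobility header, with no
 * saved source field and MOB_H_SBIT clear).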
* Else we also need to save and change the source */ if (in_hosteq(ip->ip_src, sc->g_src)) { msiz = MOB_H_SIZ_S; } else { mob_h.proto |= MOB_H_SBIT; mob_h.osrc = ip->ip_src.s_addr; ip->ip_src.s_addr = sc->g_src.s_addr; msiz = MOB_H_SIZ_L; } mob_h.proto = htons(mob_h.proto); mob_h.hcrc = gre_in_cksum((u_short *)&mob_h, msiz); if ((m->m_data - msiz) < m->m_pktdat) { /* need new mbuf */ MGETHDR(m0, M_DONTWAIT, MT_HEADER); if (m0 == NULL) { _IF_DROP(&ifp->if_snd); m_freem(m); error = ENOBUFS; goto end; } m0->m_next = m; m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m0->m_pkthdr.len = m->m_pkthdr.len + msiz; m0->m_len = msiz + sizeof(struct ip); m0->m_data += max_linkhdr; memcpy(mtod(m0, caddr_t), (caddr_t)ip, sizeof(struct ip)); m = m0; } else { /* we have some space left in the old one */ m->m_data -= msiz; m->m_len += msiz; m->m_pkthdr.len += msiz; bcopy(ip, mtod(m, caddr_t), sizeof(struct ip)); } ip = mtod(m, struct ip *); memcpy((caddr_t)(ip + 1), &mob_h, (unsigned)msiz); ip->ip_len = ntohs(ip->ip_len) + msiz; } else { /* AF_INET */ _IF_DROP(&ifp->if_snd); m_freem(m); error = EINVAL; goto end; } } else if (sc->g_proto == IPPROTO_GRE) { switch (dst->sa_family) { case AF_INET: ip = mtod(m, struct ip *); etype = ETHERTYPE_IP; break; #ifdef NETATALK case AF_APPLETALK: etype = ETHERTYPE_ATALK; break; #endif #ifdef NS case AF_NS: etype = ETHERTYPE_NS; break; #endif default: _IF_DROP(&ifp->if_snd); m_freem(m); error = EAFNOSUPPORT; goto end; } M_PREPEND(m, sizeof(struct greip), M_DONTWAIT); } else { _IF_DROP(&ifp->if_snd); m_freem(m); error = EINVAL; goto end; } if (m == NULL) { /* impossible */ _IF_DROP(&ifp->if_snd); error = ENOBUFS; goto end; } gh = mtod(m, struct greip *); if (sc->g_proto == IPPROTO_GRE) { /* we don't have any GRE flags for now */ memset((void *)&gh->gi_g, 0, sizeof(struct gre_h)); gh->gi_ptype = htons(etype); } gh->gi_pr = sc->g_proto; if (sc->g_proto != IPPROTO_MOBILE) { gh->gi_src = sc->g_src; gh->gi_dst = sc->g_dst; ((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2; ((struct ip*)gh)->ip_ttl = GRE_TTL; ((struct ip*)gh)->ip_tos = ip->ip_tos; ((struct ip*)gh)->ip_id = ip->ip_id; gh->gi_len = m->m_pkthdr.len; } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; /* send it off */ - error = ip_output(m, NULL, &sc->route, 0, NULL); + error = ip_output(m, NULL, &sc->route, 0, NULL, NULL); end: sc->called = 0; if (error) ifp->if_oerrors++; return (error); } static int gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct if_laddrreq *lifr = (struct if_laddrreq *)data; struct in_aliasreq *aifr = (struct in_aliasreq *)data; struct gre_softc *sc = ifp->if_softc; int s; struct sockaddr_in si; struct sockaddr *sa = NULL; int error; struct sockaddr_in sp, sm, dp, dm; error = 0; s = splnet(); switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; break; case SIOCSIFDSTADDR: break; case SIOCSIFFLAGS: if ((error = suser(curthread)) != 0) break; if ((ifr->ifr_flags & IFF_LINK0) != 0) sc->g_proto = IPPROTO_GRE; else sc->g_proto = IPPROTO_MOBILE; goto recompute; case SIOCSIFMTU: if ((error = suser(curthread)) != 0) break; if (ifr->ifr_mtu < 576) { error = EINVAL; break; } ifp->if_mtu = ifr->ifr_mtu; break; case SIOCGIFMTU: ifr->ifr_mtu = sc->sc_if.if_mtu; break; case SIOCADDMULTI: case SIOCDELMULTI: if ((error = suser(curthread)) != 0) break; if (ifr == 0) { error = EAFNOSUPPORT; break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif default: error = EAFNOSUPPORT; break; } break; case 
GRESPROTO: if ((error = suser(curthread)) != 0) break; sc->g_proto = ifr->ifr_flags; switch (sc->g_proto) { case IPPROTO_GRE: ifp->if_flags |= IFF_LINK0; break; case IPPROTO_MOBILE: ifp->if_flags &= ~IFF_LINK0; break; default: error = EPROTONOSUPPORT; break; } goto recompute; case GREGPROTO: ifr->ifr_flags = sc->g_proto; break; case GRESADDRS: case GRESADDRD: if ((error = suser(curthread)) != 0) break; /* * set tunnel endpoints, compute a less specific route * to the remote end and mark it as up */ sa = &ifr->ifr_addr; if (cmd == GRESADDRS) sc->g_src = (satosin(sa))->sin_addr; if (cmd == GRESADDRD) sc->g_dst = (satosin(sa))->sin_addr; recompute: #ifdef INET if (sc->encap != NULL) { encap_detach(sc->encap); sc->encap = NULL; } #endif if ((sc->g_src.s_addr != INADDR_ANY) && (sc->g_dst.s_addr != INADDR_ANY)) { bzero(&sp, sizeof(sp)); bzero(&sm, sizeof(sm)); bzero(&dp, sizeof(dp)); bzero(&dm, sizeof(dm)); sp.sin_len = sm.sin_len = dp.sin_len = dm.sin_len = sizeof(struct sockaddr_in); sp.sin_family = sm.sin_family = dp.sin_family = dm.sin_family = AF_INET; sp.sin_addr = sc->g_src; dp.sin_addr = sc->g_dst; sm.sin_addr.s_addr = dm.sin_addr.s_addr = INADDR_BROADCAST; #ifdef INET sc->encap = encap_attach(AF_INET, sc->g_proto, sintosa(&sp), sintosa(&sm), sintosa(&dp), sintosa(&dm), (sc->g_proto == IPPROTO_GRE) ? &in_gre_protosw : &in_mobile_protosw, sc); if (sc->encap == NULL) printf("%s: unable to attach encap\n", if_name(&sc->sc_if)); #endif if (sc->route.ro_rt != 0) /* free old route */ RTFREE(sc->route.ro_rt); if (gre_compute_route(sc) == 0) ifp->if_flags |= IFF_RUNNING; else ifp->if_flags &= ~IFF_RUNNING; } break; case GREGADDRS: memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; sa = sintosa(&si); ifr->ifr_addr = *sa; break; case GREGADDRD: memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_dst.s_addr; sa = sintosa(&si); ifr->ifr_addr = *sa; break; case SIOCSIFPHYADDR: if ((error = suser(curthread)) != 0) break; if (aifr->ifra_addr.sin_family != AF_INET || aifr->ifra_dstaddr.sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (aifr->ifra_addr.sin_len != sizeof(si) || aifr->ifra_dstaddr.sin_len != sizeof(si)) { error = EINVAL; break; } sc->g_src = aifr->ifra_addr.sin_addr; sc->g_dst = aifr->ifra_dstaddr.sin_addr; goto recompute; case SIOCSLIFPHYADDR: if ((error = suser(curthread)) != 0) break; if (lifr->addr.ss_family != AF_INET || lifr->dstaddr.ss_family != AF_INET) { error = EAFNOSUPPORT; break; } if (lifr->addr.ss_len != sizeof(si) || lifr->dstaddr.ss_len != sizeof(si)) { error = EINVAL; break; } sc->g_src = (satosin((struct sockaddr *)&lifr->addr))->sin_addr; sc->g_dst = (satosin((struct sockaddr *)&lifr->dstaddr))->sin_addr; goto recompute; case SIOCDIFPHYADDR: if ((error = suser(curthread)) != 0) break; sc->g_src.s_addr = INADDR_ANY; sc->g_dst.s_addr = INADDR_ANY; goto recompute; case SIOCGLIFPHYADDR: if (sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; memcpy(&lifr->addr, &si, sizeof(si)); si.sin_addr.s_addr = sc->g_dst.s_addr; memcpy(&lifr->dstaddr, &si, sizeof(si)); break; case SIOCGIFPSRCADDR: if (sc->g_src.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct 
sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); break; case SIOCGIFPDSTADDR: if (sc->g_dst.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } memset(&si, 0, sizeof(si)); si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_dst.s_addr; bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); break; default: error = EINVAL; break; } splx(s); return (error); } /* * computes a route to our destination that is not the one * which would be taken by ip_output(), as that one will loop back to * us. If the interface is p2p as a--->b, then a routing entry for * a-->b exists. If we now send a packet to b (e.g. ping b), it comes * down here, gets src=a, dst=b tacked on, and would be sent by * ip_output() straight back to if_gre. * The goal here is to compute a route to b that is less specific than * a-->b. We know that such a route exists, as in normal operation we * have at least a default route which matches. */ static int gre_compute_route(struct gre_softc *sc) { struct route *ro; u_int32_t a, b, c; ro = &sc->route; memset(ro, 0, sizeof(struct route)); ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(ro->ro_dst); /* * toggle the last bit, so our interface is not found, but a less * specific route is. I'd rather specify a shorter mask, * but this is not possible. Should work though. XXX * there is a simpler way ... */ if ((sc->sc_if.if_flags & IFF_LINK1) == 0) { a = ntohl(sc->g_dst.s_addr); b = a & 0x01; c = a & 0xfffffffe; b = b ^ 0x01; a = b | c; ((struct sockaddr_in *)&ro->ro_dst)->sin_addr.s_addr = htonl(a); } #ifdef DIAGNOSTIC printf("%s: searching a route to %s", if_name(&sc->sc_if), inet_ntoa(((struct sockaddr_in *)&ro->ro_dst)->sin_addr)); #endif rtalloc(ro); /* * check that this returned a route at all, and that the route * does not recurse back to ourselves */ if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp->if_softc == sc) { #ifdef DIAGNOSTIC if (ro->ro_rt == NULL) printf(" - no route found!\n"); else printf(" - route loops back to ourself!\n"); #endif return EADDRNOTAVAIL; } /* * now change it back - else ip_output will just drop * the route and search one to this interface ... */ if ((sc->sc_if.if_flags & IFF_LINK1) == 0) ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; #ifdef DIAGNOSTIC printf(", choosing %s with gateway %s", if_name(ro->ro_rt->rt_ifp), inet_ntoa(((struct sockaddr_in *)(ro->ro_rt->rt_gateway))->sin_addr)); printf("\n"); #endif return 0; } /* * do a checksum of a buffer - much like in_cksum, which operates on * mbufs. 
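 *
 * A worked example of the one's complement arithmetic used here:
 * summing the 16-bit words 0xffff and 0x0002 gives 0x10001; the
 * end-around carry folds this to 0x0001 + 0x0001 = 0x0002, and the
 * function returns the complement, ~0x0002 == 0xfffd.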
*/ u_short gre_in_cksum(u_short *p, u_int len) { u_int sum = 0; int nwords = len >> 1; while (nwords-- != 0) sum += *p++; if (len & 1) { union { u_short w; u_char c[2]; } u; u.c[0] = *(u_char *)p; u.c[1] = 0; sum += u.w; } /* end-around-carry */ sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); return (~sum); } static int gremodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: greattach(); break; case MOD_UNLOAD: if_clone_detach(&gre_cloner); while (!LIST_EMPTY(&gre_softc_list)) gre_clone_destroy(&LIST_FIRST(&gre_softc_list)->sc_if); break; } return 0; } static moduledata_t gre_mod = { "if_gre", gremodevent, 0 }; DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gre, 1); Index: head/sys/net/if_loop.c =================================================================== --- head/sys/net/if_loop.c (revision 105193) +++ head/sys/net/if_loop.c (revision 105194) @@ -1,449 +1,448 @@ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. 
*/ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef IPX #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #ifdef NS #include #include #endif #ifdef NETATALK #include #include #endif #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif #define LONAME "lo" struct lo_softc { struct ifnet sc_if; /* network-visible interface */ LIST_ENTRY(lo_softc) sc_next; }; int loioctl(struct ifnet *, u_long, caddr_t); static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); int lo_clone_create(struct if_clone *, int); void lo_clone_destroy(struct ifnet *); struct ifnet *loif = NULL; /* Used externally */ static MALLOC_DEFINE(M_LO, LONAME, "Loopback Interface"); static LIST_HEAD(lo_list, lo_softc) lo_list; struct if_clone lo_cloner = IF_CLONE_INITIALIZER(LONAME, lo_clone_create, lo_clone_destroy, 1, IF_MAXUNIT); void lo_clone_destroy(ifp) struct ifnet *ifp; { struct lo_softc *sc; sc = ifp->if_softc; /* XXX: destroying lo0 will lead to panics. */ KASSERT(loif != ifp, ("%s: destroying lo0", __func__)); bpfdetach(ifp); if_detach(ifp); LIST_REMOVE(sc, sc_next); free(sc, M_LO); } int lo_clone_create(ifc, unit) struct if_clone *ifc; int unit; { struct lo_softc *sc; MALLOC(sc, struct lo_softc *, sizeof(*sc), M_LO, M_WAITOK | M_ZERO); sc->sc_if.if_name = LONAME; sc->sc_if.if_unit = unit; sc->sc_if.if_mtu = LOMTU; sc->sc_if.if_flags = IFF_LOOPBACK | IFF_MULTICAST; sc->sc_if.if_ioctl = loioctl; sc->sc_if.if_output = looutput; sc->sc_if.if_type = IFT_LOOP; sc->sc_if.if_snd.ifq_maxlen = ifqmaxlen; sc->sc_if.if_softc = sc; if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_NULL, sizeof(u_int)); LIST_INSERT_HEAD(&lo_list, sc, sc_next); if (loif == NULL) loif = &sc->sc_if; return (0); } static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: LIST_INIT(&lo_list); if_clone_attach(&lo_cloner); break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return EINVAL; } return 0; } static moduledata_t loop_mod = { "loop", loop_modevent, 0 }; DECLARE_MODULE(loop, loop_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); int looutput(ifp, m, dst, rt) struct ifnet *ifp; register struct mbuf *m; struct sockaddr *dst; register struct rtentry *rt; { if ((m->m_flags & M_PKTHDR) == 0) panic("looutput no HDR"); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } /* * KAME requires that the packet be contiguous in the * mbuf. We need to make sure of that. * This kind of code should be avoided. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_HEADER); if (!n) goto contiguousfail; MCLGET(n, M_DONTWAIT); if (! 
(n->m_flags & M_EXT)) { m_freem(n); goto contiguousfail; } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_pkthdr = m->m_pkthdr; n->m_len = m->m_pkthdr.len; - n->m_pkthdr.aux = m->m_pkthdr.aux; - m->m_pkthdr.aux = (struct mbuf *)NULL; + SLIST_INIT(&m->m_pkthdr.tags); m_freem(m); m = n; } if (0) { contiguousfail: printf("looutput: mbuf allocation failed\n"); } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; #if 1 /* XXX */ switch (dst->sa_family) { case AF_INET: case AF_INET6: case AF_IPX: case AF_NS: case AF_APPLETALK: break; default: printf("looutput: af=%d unexpected\n", dst->sa_family); m_freem(m); return (EAFNOSUPPORT); } #endif return(if_simloop(ifp, m, dst->sa_family, 0)); } /* * if_simloop() * * This function supports software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. * * This function expects the packet to include the media header of length hlen. */ int if_simloop(ifp, m, af, hlen) struct ifnet *ifp; struct mbuf *m; int af; int hlen; { int isr; struct ifqueue *inq = 0; KASSERT((m->m_flags & M_PKTHDR) != 0, ("if_simloop: no HDR")); m->m_pkthdr.rcvif = ifp; /* BPF write needs to be handled specially */ if (af == AF_UNSPEC) { KASSERT(m->m_len >= sizeof(int), ("if_simloop: m_len")); af = *(mtod(m, int *)); m->m_len -= sizeof(int); m->m_pkthdr.len -= sizeof(int); m->m_data += sizeof(int); } /* Let BPF see incoming packet */ if (ifp->if_bpf) { struct mbuf m0, *n = m; if (ifp->if_bpf->bif_dlt == DLT_NULL) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; n = &m0; } bpf_mtap(ifp, n); } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #if defined(__alpha__) || defined(__ia64__) || defined(__sparc64__) /* The alpha doesn't like unaligned data. * We move data down in the first mbuf */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); mtod(m,vm_offset_t) -= (mtod(m, vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: inq = &ipintrq; isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; inq = &ip6intrq; isr = NETISR_IPV6; break; #endif #ifdef IPX case AF_IPX: inq = &ipxintrq; isr = NETISR_IPX; break; #endif #ifdef NS case AF_NS: inq = &nsintrq; isr = NETISR_NS; break; #endif #ifdef NETATALK case AF_APPLETALK: inq = &atintrq2; isr = NETISR_ATALK; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; (void) IF_HANDOFF(inq, m, NULL); schednetisr(isr); return (0); } /* ARGSUSED */ static void lortrequest(cmd, rt, info) int cmd; struct rtentry *rt; struct rt_addrinfo *info; { if (rt) { rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ /* * For optimal performance, the send and receive buffers * should be at least twice the MTU plus a little more for * overhead. */ rt->rt_rmx.rmx_recvpipe = rt->rt_rmx.rmx_sendpipe = 3 * LOMTU; } } /* * Process an ioctl request. 
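 *
 * For illustration only (userland side, not part of this driver), the
 * SIOCSIFMTU case below is typically reached by code along these lines:
 *
 *	struct ifreq ifr;
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 *	ifr.ifr_mtu = 16384;
 *	ioctl(s, SIOCSIFMTU, &ifr);
 *
 * with s an AF_INET socket.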
*/ /* ARGSUSED */ int loioctl(ifp, cmd, data) register struct ifnet *ifp; u_long cmd; caddr_t data; { register struct ifaddr *ifa; register struct ifreq *ifr = (struct ifreq *)data; register int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP | IFF_RUNNING; ifa = (struct ifaddr *)data; ifa->ifa_rtrequest = lortrequest; /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: break; default: error = EINVAL; } return (error); } Index: head/sys/net/if_stf.c =================================================================== --- head/sys/net/if_stf.c (revision 105193) +++ head/sys/net/if_stf.c (revision 105194) @@ -1,750 +1,750 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /* * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. 
You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define STFNAME "stf" #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) #define GET_V4(x) ((struct in_addr *)(&(x)->s6_addr16[1])) struct stf_softc { struct ifnet sc_if; /* common area */ union { struct route __sc_ro4; struct route_in6 __sc_ro6; /* just for safety */ } __sc_ro46; #define sc_ro __sc_ro46.__sc_ro4 const struct encaptab *encap_cookie; LIST_ENTRY(stf_softc) sc_list; /* all stf's are linked */ }; static LIST_HEAD(, stf_softc) stf_softc_list; static MALLOC_DEFINE(M_STF, STFNAME, "6to4 Tunnel Interface"); static int ip_stf_ttl = 40; extern struct domain inetdomain; struct protosw in_stf_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, in_stf_input, (pr_output_t*)rip_output, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *); static int stf_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static void stf_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); int stf_clone_create(struct if_clone *, int); void stf_clone_destroy(struct ifnet *); /* only one clone is currently allowed */ struct if_clone stf_cloner = IF_CLONE_INITIALIZER(STFNAME, stf_clone_create, stf_clone_destroy, 0, 0); int stf_clone_create(ifc, unit) struct if_clone *ifc; int unit; { struct stf_softc *sc; sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); sc->sc_if.if_name = STFNAME; sc->sc_if.if_unit = unit; sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_protosw, sc); if (sc->encap_cookie == NULL) { printf("%s: attach failed\n", if_name(&sc->sc_if)); free(sc, M_STF); return (ENOMEM); } sc->sc_if.if_mtu = IPV6_MMTU; sc->sc_if.if_ioctl = stf_ioctl; sc->sc_if.if_output = stf_output; sc->sc_if.if_type = IFT_STF; 
sc->sc_if.if_snd.ifq_maxlen = IFQ_MAXLEN; if_attach(&sc->sc_if); bpfattach(&sc->sc_if, DLT_NULL, sizeof(u_int)); LIST_INSERT_HEAD(&stf_softc_list, sc, sc_list); return (0); } void stf_clone_destroy(ifp) struct ifnet *ifp; { int err; struct stf_softc *sc = (void *) ifp; LIST_REMOVE(sc, sc_list); err = encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); free(sc, M_STF); } static int stfmodevent(mod, type, data) module_t mod; int type; void *data; { switch (type) { case MOD_LOAD: LIST_INIT(&stf_softc_list); if_clone_attach(&stf_cloner); break; case MOD_UNLOAD: if_clone_detach(&stf_cloner); while (!LIST_EMPTY(&stf_softc_list)) stf_clone_destroy(&LIST_FIRST(&stf_softc_list)->sc_if); break; } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip ip; struct in6_ifaddr *ia6; struct stf_softc *sc; struct in_addr a, b; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((sc->sc_if.if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((sc->sc_if.if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; /* LINTED const cast */ m_copydata((struct mbuf *)(uintptr_t)m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; ia6 = stf_getsrcifa6(&sc->sc_if); if (ia6 == NULL) return 0; /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); a.s_addr = GET_V4(&ia6->ia_addr.sin6_addr)->s_addr; a.s_addr &= GET_V4(&ia6->ia_prefixmask.sin6_addr)->s_addr; b = ip.ip_src; b.s_addr &= GET_V4(&ia6->ia_prefixmask.sin6_addr)->s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface matches on a single side only */ return 32; } static struct in6_ifaddr * stf_getsrcifa6(ifp) struct ifnet *ifp; { struct ifaddr *ia; struct in_ifaddr *ia4; struct sockaddr_in6 *sin6; struct in_addr in; for (ia = TAILQ_FIRST(&ifp->if_addrlist); ia; ia = TAILQ_NEXT(ia, ifa_list)) { if (ia->ifa_addr == NULL) continue; if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; if (ia4 == NULL) continue; return (struct in6_ifaddr *)ia; } return NULL; } static int stf_output(ifp, m, dst, rt) struct ifnet *ifp; struct mbuf *m; struct sockaddr *dst; struct rtentry *rt; { struct stf_softc *sc; struct sockaddr_in6 *dst6; struct in_addr *in4; struct sockaddr_in *dst4; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_ifaddr *ia6; sc = (struct stf_softc*)ifp; dst6 = (struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); ifp->if_oerrors++; return ENETDOWN; } /* * If we don't have an ip4 address that matches our inner ip6 address, * we shouldn't generate output. 
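 * (For example, with a 6to4 source address of 2002:0a01:0101::1 the
 * outer IPv4 source must be the embedded 10.1.1.1, per the GET_V4()
 * mapping above.)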
Without this check, we'll end up * using the wrong IPv4 source. */ ia6 = stf_getsrcifa6(ifp); if (ia6 == NULL) { m_freem(m); ifp->if_oerrors++; return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { ifp->if_oerrors++; return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pick up the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) in4 = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) in4 = GET_V4(&dst6->sin6_addr); else { m_freem(m); ifp->if_oerrors++; return ENETUNREACH; } #if NBPFILTER > 0 if (ifp->if_bpf) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ struct mbuf m0; u_int32_t af = AF_INET6; m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; #ifdef HAVE_OLD_BPF bpf_mtap(ifp, &m0); #else bpf_mtap(ifp->if_bpf, &m0); #endif } #endif /*NBPFILTER > 0*/ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { ifp->if_oerrors++; return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), &ip->ip_src, sizeof(ip->ip_src)); bcopy(in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = m->m_pkthdr.len; /*host order*/ if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { /* cached route doesn't match */ dst4->sin_family = AF_INET; dst4->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); if (sc->sc_ro.ro_rt) { RTFREE(sc->sc_ro.ro_rt); sc->sc_ro.ro_rt = NULL; } } if (sc->sc_ro.ro_rt == NULL) { rtalloc(&sc->sc_ro); if (sc->sc_ro.ro_rt == NULL) { m_freem(m); ifp->if_oerrors++; return ENETUNREACH; } } ifp->if_opackets++; - return ip_output(m, NULL, &sc->sc_ro, 0, NULL); + return ip_output(m, NULL, &sc->sc_ro, 0, NULL, NULL); } static int isrfc1918addr(in) struct in_addr *in; { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if ((ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168) return 1; return 0; } static int stf_checkaddr4(sc, in, inifp) struct stf_softc *sc; struct in_addr *in; struct ifnet *inifp; /* incoming interface */ { struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. 
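 * (i.e. 10.0.0.0/8, 172.16.0.0/12 or 192.168.0.0/16, as tested by
 * isrfc1918addr() above)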
* (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with a broadcast address */ for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) return -1; } /* * perform ingress filter */ if (sc && (sc->sc_if.if_flags & IFF_LINK2) == 0 && inifp) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = *in; rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); if (!rt || rt->rt_ifp != inifp) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " "due to ingress filter\n", if_name(&sc->sc_if), (u_int32_t)ntohl(sin.sin_addr.s_addr)); #endif if (rt) rtfree(rt); return -1; } rtfree(rt); } return 0; } static int stf_checkaddr6(sc, in6, inifp) struct stf_softc *sc; struct in6_addr *in6; struct ifnet *inifp; /* incoming interface */ { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) return stf_checkaddr4(sc, GET_V4(in6), inifp); /* * reject anything that looks suspicious. The test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) be safe against future ip6_input changes. */ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } void in_stf_input(m, off) struct mbuf *m; int off; { int proto; struct stf_softc *sc; struct ip *ip; struct ip6_hdr *ip6; u_int8_t otos, itos; int len, isr; struct ifqueue *ifq = NULL; struct ifnet *ifp; proto = mtod(m, struct ip *)->ip_p; if (proto != IPPROTO_IPV6) { m_freem(m); return; } ip = mtod(m, struct ip *); sc = (struct stf_softc *)encap_getarg(m); if (sc == NULL || (sc->sc_if.if_flags & IFF_UP) == 0) { m_freem(m); return; } ifp = &sc->sc_if; /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return; } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return; } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return; } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (ifp->if_bpf) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ struct mbuf m0; u_int32_t af = AF_INET6; m0.m_next = m; m0.m_len = 4; m0.m_data = (char *)&af; #ifdef HAVE_OLD_BPF bpf_mtap(ifp, &m0); #else bpf_mtap(ifp->if_bpf, &m0); #endif } /* * Put the packet on the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reordering due to extra queueing. */ ifq = &ip6intrq; isr = NETISR_IPV6; len = m->m_pkthdr.len; if (! 
IF_HANDOFF(ifq, m, NULL)) return; schednetisr(isr); ifp->if_ipackets++; ifp->if_ibytes += len; } /* ARGSUSED */ static void stf_rtrequest(cmd, rt, info) int cmd; struct rtentry *rt; struct rt_addrinfo *info; { if (rt) rt->rt_rmx.rmx_mtu = IPV6_MMTU; } static int stf_ioctl(ifp, cmd, data) struct ifnet *ifp; u_long cmd; caddr_t data; { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; int error; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_ADDR_6TO4(&sin6->sin6_addr) && !isrfc1918addr(GET_V4(&sin6->sin6_addr))) { ifa->ifa_rtrequest = stf_rtrequest; ifp->if_flags |= IFF_UP; } else error = EINVAL; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; default: error = EINVAL; break; } return error; } Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 105193) +++ head/sys/netinet/igmp.c (revision 105194) @@ -1,493 +1,493 @@ /* * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 * $FreeBSD$ */ /* * Internet Group Management Protocol (IGMP) routines. * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. 
* * MULTICAST Revision: 3.5.1.4 */ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); static struct router_info * find_rti(struct ifnet *ifp); static struct igmpstat igmpstat; SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, &igmpstat, igmpstat, ""); static int igmp_timers_are_running; static u_long igmp_all_hosts_group; static u_long igmp_all_rtrs_group; static struct mbuf *router_alert; static struct router_info *Head; static void igmp_sendpkt(struct in_multi *, int, unsigned long); void igmp_init() { struct ipoption *ra; /* * To avoid byte-swapping the same value over and over again. */ igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); igmp_timers_are_running = 0; /* * Construct a Router Alert option to use in outgoing packets */ MGET(router_alert, M_DONTWAIT, MT_DATA); ra = mtod(router_alert, struct ipoption *); ra->ipopt_dst.s_addr = 0; ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ ra->ipopt_list[1] = 0x04; /* 4 bytes long */ ra->ipopt_list[2] = 0x00; ra->ipopt_list[3] = 0x00; router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; Head = (struct router_info *) 0; } static struct router_info * find_rti(ifp) struct ifnet *ifp; { register struct router_info *rti = Head; #ifdef IGMP_DEBUG printf("[igmp.c, _find_rti] --> entering \n"); #endif while (rti) { if (rti->rti_ifp == ifp) { #ifdef IGMP_DEBUG printf("[igmp.c, _find_rti] --> found old entry \n"); #endif return rti; } rti = rti->rti_next; } MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); rti->rti_ifp = ifp; rti->rti_type = IGMP_V2_ROUTER; rti->rti_time = 0; rti->rti_next = Head; Head = rti; #ifdef IGMP_DEBUG printf("[igmp.c, _find_rti] --> created an entry \n"); #endif return rti; } void igmp_input(m, off) register struct mbuf *m; int off; { register int iphlen = off; register struct igmp *igmp; register struct ip *ip; register int igmplen; register struct ifnet *ifp = m->m_pkthdr.rcvif; register int minlen; register struct in_multi *inm; register struct in_ifaddr *ia; struct in_multistep step; struct router_info *rti; int timer; /** timer value in the igmp query header **/ ++igmpstat.igps_rcv_total; ip = mtod(m, struct ip *); igmplen = ip->ip_len; /* * Validate lengths */ if (igmplen < IGMP_MINLEN) { ++igmpstat.igps_rcv_tooshort; m_freem(m); return; } minlen = iphlen + IGMP_MINLEN; if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { ++igmpstat.igps_rcv_tooshort; return; } /* * Validate checksum */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { ++igmpstat.igps_rcv_badsum; m_freem(m); return; } m->m_data -= iphlen; m->m_len += iphlen; ip = mtod(m, struct ip *); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; rti = find_rti(ifp); /* * In the IGMPv2 specification, there are 3 states and a flag. * * In Non-Member state, we simply don't have a membership record. * In Delaying Member state, our timer is running (inm->inm_timer) * In Idle Member state, our timer is not running (inm->inm_timer==0) * * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if * we have heard a report from another member, or IGMP_IREPORTEDLAST * if I sent the last report. 
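 *
 * As an illustrative sketch (not compiled in; it uses only the fields
 * named above), the three states can be read off a membership record
 * like this:
 */
#if 0
	const char *state;

	if (inm == NULL)
		state = "Non-Member";		/* no membership record */
	else if (inm->inm_timer != 0)
		state = "Delaying Member";	/* report timer running */
	else
		state = "Idle Member";		/* member, no report owed */
#endif
/*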
*/ switch (igmp->igmp_type) { case IGMP_MEMBERSHIP_QUERY: ++igmpstat.igps_rcv_queries; if (ifp->if_flags & IFF_LOOPBACK) break; if (igmp->igmp_code == 0) { /* * Old router. Remember that the querier on this * interface is old, and set the timer to the * value in RFC 1112. */ rti->rti_type = IGMP_V1_ROUTER; rti->rti_time = 0; timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; if (ip->ip_dst.s_addr != igmp_all_hosts_group || igmp->igmp_group.s_addr != 0) { ++igmpstat.igps_rcv_badqueries; m_freem(m); return; } } else { /* * New router. Simply do the new validity check. */ if (igmp->igmp_group.s_addr != 0 && !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { ++igmpstat.igps_rcv_badqueries; m_freem(m); return; } } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to the "all-hosts" group (224.0.0.1). * - Restart any timer that is already running but has * a value longer than the requested timeout. * - Use the value specified in the query message as * the maximum timeout. */ IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_ifp == ifp && inm->inm_addr.s_addr != igmp_all_hosts_group && (igmp->igmp_group.s_addr == 0 || igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { if (inm->inm_timer == 0 || inm->inm_timer > timer) { inm->inm_timer = IGMP_RANDOM_DELAY(timer); igmp_timers_are_running = 1; } } IN_NEXT_MULTI(step, inm); } break; case IGMP_V1_MEMBERSHIP_REPORT: case IGMP_V2_MEMBERSHIP_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * can potentially get looped back if we are a multicast * router, so discard reports sourced by me. */ IFP_TO_IA(ifp, ia); if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) break; ++igmpstat.igps_rcv_reports; if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { ++igmpstat.igps_rcv_badreports; m_freem(m); return; } /* * KLUDGE: if the IP source address of the report has an * unspecified (i.e., zero) subnet number, as is allowed for * a booting host, replace it with the correct subnet number * so that a process-level multicast routing daemon can * determine which subnet it arrived from. This is necessary * to compensate for the lack of any way for a process to * determine the arrival interface of an incoming packet. */ if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); /* * If we belong to the group being reported, stop * our timer for that group. */ IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); if (inm != NULL) { inm->inm_timer = 0; ++igmpstat.igps_rcv_ourreports; inm->inm_state = IGMP_OTHERMEMBER; } break; } /* * Pass all valid IGMP packets up to any process(es) listening * on a raw IGMP socket. 
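 */

/*
 * Worked example (illustrative): the max-response field of a query is
 * expressed in tenths of a second and fasttimo runs PR_FASTHZ (5)
 * times per second, so the conversion near the top of this function
 * turns igmp_code = 100 (i.e. 10 seconds) into 100 * 5 / 10 = 50
 * fast-timeout ticks.
 */

/*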
*/ rip_input(m, off); } void igmp_joingroup(inm) struct in_multi *inm; { int s = splnet(); if (inm->inm_addr.s_addr == igmp_all_hosts_group || inm->inm_ifp->if_flags & IFF_LOOPBACK) { inm->inm_timer = 0; inm->inm_state = IGMP_OTHERMEMBER; } else { inm->inm_rti = find_rti(inm->inm_ifp); igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); inm->inm_state = IGMP_IREPORTEDLAST; igmp_timers_are_running = 1; } splx(s); } void igmp_leavegroup(inm) struct in_multi *inm; { if (inm->inm_state == IGMP_IREPORTEDLAST && inm->inm_addr.s_addr != igmp_all_hosts_group && !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && inm->inm_rti->rti_type != IGMP_V1_ROUTER) igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); } void igmp_fasttimo() { register struct in_multi *inm; struct in_multistep step; int s; /* * Quick check to see if any work needs to be done, in order * to minimize the overhead of fasttimo processing. */ if (!igmp_timers_are_running) return; s = splnet(); igmp_timers_are_running = 0; IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_timer == 0) { /* do nothing */ } else if (--inm->inm_timer == 0) { igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_state = IGMP_IREPORTEDLAST; } else { igmp_timers_are_running = 1; } IN_NEXT_MULTI(step, inm); } splx(s); } void igmp_slowtimo() { int s = splnet(); register struct router_info *rti = Head; #ifdef IGMP_DEBUG printf("[igmp.c,_slowtimo] -- > entering \n"); #endif while (rti) { if (rti->rti_type == IGMP_V1_ROUTER) { rti->rti_time++; if (rti->rti_time >= IGMP_AGE_THRESHOLD) { rti->rti_type = IGMP_V2_ROUTER; } } rti = rti->rti_next; } #ifdef IGMP_DEBUG printf("[igmp.c,_slowtimo] -- > exiting \n"); #endif splx(s); } static struct route igmprt; static void igmp_sendpkt(inm, type, addr) struct in_multi *inm; int type; unsigned long addr; { struct mbuf *m; struct igmp *igmp; struct ip *ip; struct ip_moptions imo; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return; m->m_pkthdr.rcvif = loif; #ifdef MAC mac_create_mbuf_linklayer(inm->inm_ifp, m); #endif m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); m->m_data += sizeof(struct ip); m->m_len = IGMP_MINLEN; igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; imo.imo_multicast_ifp = inm->inm_ifp; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ imo.imo_multicast_loop = (ip_mrouter != NULL); /* * XXX * Do we have to worry about reentrancy here? Don't think so. 
*/ - ip_output(m, router_alert, &igmprt, 0, &imo); + ip_output(m, router_alert, &igmprt, 0, &imo, NULL); ++igmpstat.igps_snd_reports; } Index: head/sys/netinet/in_gif.c =================================================================== --- head/sys/netinet/in_gif.c (revision 105193) +++ head/sys/netinet/in_gif.c (revision 105194) @@ -1,354 +1,354 @@ /* $FreeBSD$ */ /* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef MROUTING #include #endif /* MROUTING */ #include #include static int ip_gif_ttl = GIF_TTL; SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, &ip_gif_ttl, 0, ""); int in_gif_output(ifp, family, m, rt) struct ifnet *ifp; int family; struct mbuf *m; struct rtentry *rt; { struct gif_softc *sc = (struct gif_softc*)ifp; struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; struct ip iphdr; /* capsule IP header, host byte ordered */ int proto, error; u_int8_t tos; if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || sin_dst->sin_family != AF_INET) { m_freem(m); return EAFNOSUPPORT; } switch (family) { #ifdef INET case AF_INET: { struct ip *ip; proto = IPPROTO_IPV4; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return ENOBUFS; } ip = mtod(m, struct ip *); tos = ip->ip_tos; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; proto = IPPROTO_IPV6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; break; } #endif /* INET6 */ default: #ifdef DEBUG printf("in_gif_output: warning: unknown family %d passed\n", family); #endif m_freem(m); return EAFNOSUPPORT; } bzero(&iphdr, sizeof(iphdr)); iphdr.ip_src = sin_src->sin_addr; /* bidirectional configured tunnel mode */ if (sin_dst->sin_addr.s_addr != INADDR_ANY) iphdr.ip_dst = sin_dst->sin_addr; else { m_freem(m); return ENETUNREACH; } iphdr.ip_p = proto; /* version will be set in ip_output() */ iphdr.ip_ttl = ip_gif_ttl; iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &iphdr.ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &iphdr.ip_tos, &tos); /* prepend new IP header */ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { printf("ENOBUFS in in_gif_output %d\n", __LINE__); return ENOBUFS; } bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); if (dst->sin_family != sin_dst->sin_family || dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { /* cache route doesn't match */ dst->sin_family = sin_dst->sin_family; dst->sin_len = sizeof(struct sockaddr_in); dst->sin_addr = sin_dst->sin_addr; if (sc->gif_ro.ro_rt) { RTFREE(sc->gif_ro.ro_rt); sc->gif_ro.ro_rt = NULL; } #if 0 sc->gif_if.if_mtu = GIF_MTU; #endif } if (sc->gif_ro.ro_rt == NULL) { rtalloc(&sc->gif_ro); if (sc->gif_ro.ro_rt == NULL) { m_freem(m); return ENETUNREACH; } /* if it constitutes infinite encapsulation, punt. 
*/ if (sc->gif_ro.ro_rt->rt_ifp == ifp) { m_freem(m); return ENETUNREACH; /* XXX */ } #if 0 ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu - sizeof(struct ip); #endif } - error = ip_output(m, NULL, &sc->gif_ro, 0, NULL); + error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); return(error); } void in_gif_input(m, off) struct mbuf *m; int off; { struct ifnet *gifp = NULL; struct ip *ip; int af; u_int8_t otos; int proto; ip = mtod(m, struct ip *); proto = ip->ip_p; gifp = (struct ifnet *)encap_getarg(m); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); ipstat.ips_nogif++; return; } otos = ip->ip_tos; m_adj(m, off); switch (proto) { #ifdef INET case IPPROTO_IPV4: { struct ip *ip; af = AF_INET; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return; } ip = mtod(m, struct ip *); if (gifp->if_flags & IFF_LINK1) ip_ecn_egress(ECN_ALLOWED, &otos, &ip->ip_tos); else ip_ecn_egress(ECN_NOCARE, &otos, &ip->ip_tos); break; } #endif #ifdef INET6 case IPPROTO_IPV6: { struct ip6_hdr *ip6; u_int8_t itos; af = AF_INET6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return; } ip6 = mtod(m, struct ip6_hdr *); itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if (gifp->if_flags & IFF_LINK1) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); break; } #endif /* INET6 */ default: ipstat.ips_nogif++; m_freem(m); return; } gif_input(m, af, gifp); return; } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int gif_encapcheck4(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip ip; struct gif_softc *sc; struct sockaddr_in *src, *dst; int addrmatch; struct in_ifaddr *ia4; /* sanity check done in caller */ sc = (struct gif_softc *)arg; src = (struct sockaddr_in *)sc->gif_psrc; dst = (struct sockaddr_in *)sc->gif_pdst; /* LINTED const cast */ m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); /* check for address match */ addrmatch = 0; if (src->sin_addr.s_addr == ip.ip_dst.s_addr) addrmatch |= 1; if (dst->sin_addr.s_addr == ip.ip_src.s_addr) addrmatch |= 2; if (addrmatch != 3) return 0; /* martian filters on outer source - NOT done in ip_input! 
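 */

/*
 * Illustrative note: addrmatch above is a two-bit score - bit 0 is set
 * when the outer destination equals the configured tunnel source, and
 * bit 1 when the outer source equals the configured tunnel destination -
 * so only a packet that matches the tunnel in both directions
 * (addrmatch == 3) survives the check.
 */

/*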
*/ if (IN_MULTICAST(ntohl(ip.ip_src.s_addr))) return 0; switch ((ntohl(ip.ip_src.s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return 0; } /* reject packets with broadcast on source */ TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) return 0; } /* ingress filters on outer source */ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = ip.ip_src; rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " "due to ingress filter\n", if_name(&sc->gif_if), (u_int32_t)ntohl(sin.sin_addr.s_addr)); #endif if (rt) rtfree(rt); return 0; } rtfree(rt); } return 32 * 2; } Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 105193) +++ head/sys/netinet/ip_divert.c (revision 105194) @@ -1,568 +1,568 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_ipsec.h" #include "opt_mac.h" #ifndef INET #error "IPDIVERT requires INET." 
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw, see the divert(4) * manpage for features. * Internally, packets selected by ipfw in ip_input() or ip_output(), * and never diverted before, are passed to the input queue of the * divert socket with a given 'divert_port' number (as specified in * the matching ipfw rule), and they are tagged with a 16 bit cookie * (representing the rule number of the matching ipfw rule), which * is passed to the process reading from the socket. * * Packets written to the divert socket are again tagged with a cookie * (usually the same as above) and a destination address. * If the destination address is INADDR_ANY then the packet is * treated as outgoing and sent to ip_output(), otherwise it is * treated as incoming and sent to ip_input(). * In both cases, the packet is tagged with the cookie. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * ipfw processing will start at the rule number after the one * written in the cookie (so, tagging a packet with a cookie of 0 * will cause it to be effectively considered as a standard packet). */ /* Internal variables */ static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ /* Optimization: have this preinitialized */ static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET }; /* * Initialize divert connection block queue. */ void div_init(void) { INP_INFO_LOCK_INIT(&divcbinfo, "div"); LIST_INIT(&divcb); divcbinfo.listhead = &divcb; /* * XXX We don't use the hash list for divert IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); } /* * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets * with that protocol number to enter the system from the outside. */ void div_input(struct mbuf *m, int off) { ipstat.ips_noproto++; m_freem(m); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Set up generic address and protocol structures for the div_input * routine, then pass them along with the mbuf chain. */ void divert_packet(struct mbuf *m, int incoming, int port, int rule) { struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; /* Sanity check */ KASSERT(port != 0, ("%s: port=0", __func__)); divsrc.sin_port = rule; /* record matching rule */ /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* * Record receive interface address, if any. * But only for incoming packets.
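 */

/*
 * Illustrative userland sketch (not part of this commit; it assumes
 * only the divert(4) interface described above, and the port number is
 * an arbitrary example): bind a divert socket, read one diverted
 * packet and reinject it unchanged.
 */
#if 0
static void
divert_echo(u_short port)
{
	struct sockaddr_in sin;
	socklen_t slen = sizeof(sin);
	char buf[65535];
	ssize_t n;
	int fd;

	fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
	bzero(&sin, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);	/* must match the ipfw divert rule */
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
	n = recvfrom(fd, buf, sizeof(buf), 0,
	    (struct sockaddr *)&sin, &slen);
	/* sin now carries the matching rule number and receive iface */
	if (n > 0)
		sendto(fd, buf, n, 0, (struct sockaddr *)&sin, slen);
}
#endif

/*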
*/ divsrc.sin_addr.s_addr = 0; if (incoming) { struct ifaddr *ifa; /* Sanity check */ KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __func__)); /* Find IP address for receive interface */ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero)); if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero), "%s%d", m->m_pkthdr.rcvif->if_name, m->m_pkthdr.rcvif->if_unit); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)port); LIST_FOREACH(inp, &divcb, inp_list) { if (inp->inp_lport == nport) sa = inp->inp_socket; } if (sa) { if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) m_freem(m); else sorwakeup(sa); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { int error = 0; struct m_hdr divert_tag; /* * Prepare the tag for divert info. Note that a packet * with a 0 tag in mh_data is effectively untagged, * so we could optimize that case. */ divert_tag.mh_type = MT_TAG; divert_tag.mh_flags = PACKET_TAG_DIVERT; divert_tag.mh_next = m; divert_tag.mh_data = 0; /* the matching rule # */ m->m_pkthdr.rcvif = NULL; /* XXX is it necessary ? */ #ifdef MAC mac_create_mbuf_from_socket(so, m); #endif if (control) m_freem(control); /* XXX */ /* Loopback avoidance and state recovery */ if (sin) { int i; divert_tag.mh_data = (caddr_t)(int)sin->sin_port; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. 
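 */

/*
 * Illustrative round trip (hypothetical interface name): divert_packet()
 * above stuffed, say, "fxp0" into sin_zero; a daemon that echoes the
 * sockaddr back unchanged lets the scan below find that NUL-terminated
 * name, and ifunit("fxp0") then recovers the receive interface.
 */

/*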
*/ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct inpcb *const inp = sotoinpcb(so); struct ip *const ip = mtod(m, struct ip *); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; goto cantsend; } /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); /* Send packet to output processing */ ipstat.ips_rawout++; /* XXX */ error = ip_output((struct mbuf *)&divert_tag, inp->inp_options, &inp->inp_route, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, - inp->inp_moptions); + inp->inp_moptions, NULL); } else { if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; } /* Send packet to input processing */ ip_input((struct mbuf *)&divert_tag); } return error; cantsend: m_freem(m); return error; } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error, s; inp = sotoinpcb(so); if (inp) panic("div_attach"); if (td && (error = suser(td)) != 0) return error; error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; s = splnet(); error = in_pcballoc(so, &divcbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; /* The socket is always "connected" because we always know "where" to send the packet */ so->so_state |= SS_ISCONNECTED; return 0; } static int div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); if (inp == 0) panic("div_detach"); in_pcbdetach(inp); return 0; } static int div_abort(struct socket *so) { soisdisconnected(so); return div_detach(so); } static int div_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return div_abort(so); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s; int error; s = splnet(); inp = sotoinpcb(so); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't supply one, we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbbind * and should probably have its own family.
*/ if (nam->sa_family != AF_INET) error = EAFNOSUPPORT; else { ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; error = in_pcbbind(inp, nam, td); } splx(s); return error; } static int div_shutdown(struct socket *so) { socantsendmore(so); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ s = splnet(); gencnt = divcbinfo.ipi_gencnt; n = divcbinfo.ipi_count; splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->td, inp)) inp_list[i++] = inp; } splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); xig.xig_gen = divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = divcbinfo.ipi_count; splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int div_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &divcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. 
*/ static int div_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &divcbinfo)); } SYSCTL_DECL(_net_inet_divert); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); struct pr_usrreqs div_usrreqs = { div_abort, pru_accept_notsupp, div_attach, div_bind, pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, div_disconnect, pru_listen_notsupp, div_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, div_sockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet/ip_dummynet.c =================================================================== --- head/sys/netinet/ip_dummynet.c (revision 105193) +++ head/sys/netinet/ip_dummynet.c (revision 105194) @@ -1,1971 +1,1971 @@ /* * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * This module implements IP dummynet, a bandwidth limiter/delay emulator * used in conjunction with the ipfw package. * Description of the data structures used is in ip_dummynet.h * Here you mainly find the following blocks of code: * + variable declarations; * + heap management functions; * + scheduler and dummynet functions; * + configuration and initialization. * * NOTA BENE: critical sections are protected by splimp()/splx() * pairs. One would think that splnet() is enough as for most of * the netinet code, but it is not so because when used with * bridging, dummynet is invoked at splimp(). * * Most important Changes: * * 011004: KLDable * 010124: Fixed WF2Q behaviour * 010122: Fixed spl protection. * 000601: WF2Q support * 000106: large rewrite, use heaps to handle very many pipes. 
* 980513: initial release * * include files marked with XXX are probably not needed */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for struct arpcom */ #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timer.c) */ static dn_key curr_time = 0 ; /* current simulation time */ static int dn_hash_size = 64 ; /* default hash size */ /* statistics on number of queue searches and search steps */ static int searches, search_steps ; static int pipe_expire = 1 ; /* expire queue if empty */ static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ /* * Three heaps contain queues and pipes that the scheduler handles: * * ready_heap contains all dn_flow_queue related to fixed-rate pipes. * * wfq_ready_heap contains the pipes associated with WF2Q flows * * extract_heap contains pipes associated with delay lines. * */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; static int heap_init(struct dn_heap *h, int size) ; static int heap_insert (struct dn_heap *h, dn_key key1, void *p); static void heap_extract(struct dn_heap *h, void *obj); static void transmit_event(struct dn_pipe *pipe); static void ready_event(struct dn_flow_queue *q); static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ static struct callout_handle dn_timeout; #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time, CTLFLAG_RD, &curr_time, 0, "Current tick"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, CTLFLAG_RD, &searches, 0, "Number of queue searches"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, CTLFLAG_RW, &dn_max_ratio, 0, "Max ratio between dynamic queues and buckets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); #endif static int config_pipe(struct dn_pipe *p); static int ip_dn_ctl(struct sockopt *sopt); static void rt_unref(struct rtentry *); static void dummynet(void *); static void dummynet_flush(void); void dummynet_drain(void); static ip_dn_io_t dummynet_io; static void dn_rule_delete(void *); int if_tx_rdy(struct ifnet 
*ifp); static void rt_unref(struct rtentry *rt) { if (rt == NULL) return ; if (rt->rt_refcnt <= 0) printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); RTFREE(rt); } /* * Heap management functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * XXX failure to allocate a new element is a pretty bad failure * as we basically stall a whole queue forever!! * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( 2*(x) + 1 ) #define HEAP_IS_LEFT(x) ( (x) & 1 ) #define HEAP_RIGHT(x) ( 2*(x) + 2 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_init(struct dn_heap *h, int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) { printf("heap_init, Bogus call, have %d want %d\n", h->size, new_size); return 0 ; } new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT ); if (p == NULL) { printf(" heap_init, resize %d failed\n", new_size ); return 1 ; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DUMMYNET); } h->p = p ; h->size = new_size ; return 0 ; } /* * Insert element in heap. Normally, p != NULL, we insert p in * a new position and bubble up. If p == NULL, then the element is * already in place, and key is the position where to start the * bubble-up. * Returns 1 on failure (cannot allocate new heap entry) * * If offset > 0 the position (index, int) of the element in the heap is * also stored in the element itself at the given offset in bytes. */ #define SET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; /* * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. */ #define RESET_OFFSET(heap, node) \ if (heap->offset > 0) \ *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; static int heap_insert(struct dn_heap *h, dn_key key1, void *p) { int son = h->elements ; if (p == NULL) /* data already there, set starting point */ son = key1 ; else { /* insert new element at the end, possibly resize */ son = h->elements ; if (son == h->size) /* need resize... */ if (heap_init(h, h->elements+1) ) return 1 ; /* failure... 
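 */

/*
 * Worked example (illustrative): with the macros above, the parent of
 * node 5 is (5 - 1) / 2 = 2, and node 2's children are 2*2+1 = 5 and
 * 2*2+2 = 6. HEAP_INCREMENT = 15 rounds allocations up to multiples
 * of 16: heap_init(h, 17) allocates (17 + 15) & ~15 = 32 entries.
 */

/*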
*/ h->p[son].object = p ; h->p[son].key = key1 ; h->elements++ ; } while (son > 0) { /* bubble up */ int father = HEAP_FATHER(son) ; struct dn_heap_entry tmp ; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break ; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp) ; SET_OFFSET(h, son); son = father ; } SET_OFFSET(h, son); return 0 ; } /* * remove top element from heap, or obj if obj != NULL */ static void heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1 ; if (max < 0) { printf("warning, extract from empty heap 0x%p\n", h); return ; } father = 0 ; /* default: move up smallest child */ if (obj != NULL) { /* extract specific element, index is at offset */ if (h->offset <= 0) panic("*** heap_extract from middle not supported on this heap!!!\n"); father = *((int *)((char *)obj + h->offset)) ; if (father < 0 || father >= h->elements) { printf("dummynet: heap_extract, father %d out of bound 0..%d\n", father, h->elements); panic("heap_extract"); } } RESET_OFFSET(h, father); child = HEAP_LEFT(father) ; /* left child */ while (child <= max) { /* valid entry */ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child = child+1 ; /* take right child, otherwise left */ h->p[father] = h->p[child] ; SET_OFFSET(h, father); father = child ; child = HEAP_LEFT(child) ; /* left child for next loop */ } h->elements-- ; if (father != max) { /* * Fill hole with last entry and bubble up, reusing the insert code */ h->p[father] = h->p[max] ; heap_insert(h, father, NULL); /* this one cannot fail */ } } #if 0 /* * change object position and update references * XXX this one is never used! */ static void heap_move(struct dn_heap *h, dn_key new_key, void *object) { int temp; int i ; int max = h->elements-1 ; struct dn_heap_entry buf ; if (h->offset <= 0) panic("cannot move items on this heap"); i = *((int *)((char *)object + h->offset)); if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ h->p[i].key = new_key ; for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; i = temp ) { /* bubble up */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } } else { /* must move down */ h->p[i].key = new_key ; while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) temp++ ; /* select child with min key */ if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ HEAP_SWAP(h->p[i], h->p[temp], buf) ; SET_OFFSET(h, i); } else break ; i = temp ; } } SET_OFFSET(h, i); } #endif /* heap_move, unused */ /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. */ static void heapify(struct dn_heap *h) { int i ; for (i = 0 ; i < h->elements ; i++ ) heap_insert(h, i , NULL) ; } /* * cleanup the heap and free data structure */ static void heap_free(struct dn_heap *h) { if (h->size >0 ) free(h->p, M_DUMMYNET); bzero(h, sizeof(*h) ); } /* * --- end of heap management functions --- */ /* * Scheduler functions: * * transmit_event() is called when the delay-line needs to enter * the scheduler, either because of existing pkts getting ready, * or new packets entering the queue. The event handled is the delivery * time of the packet. * * ready_event() does something similar with fixed-rate queues, and the * event handled is the finish time of the head pkt. 
* * wfq_ready_event() does something similar with WF2Q queues, and the * event handled is the start time of the head pkt. * * In all cases, we make sure that the data structures are consistent * before passing pkts out, because this might trigger recursive * invocations of the procedures. */ static void transmit_event(struct dn_pipe *pipe) { struct dn_pkt *pkt ; while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { /* * first unlink, then call procedures, since ip_input() can invoke * ip_output() and viceversa, thus causing nested calls */ pipe->head = DN_NEXT(pkt) ; /* * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf * (NOT A REAL one, just a small block of malloc'ed memory) with * m_type = MT_TAG, m_flags = PACKET_TAG_DUMMYNET * dn_m (m_next) = actual mbuf to be processed by ip_input/output * and some other fields. * The block IS FREED HERE because it contains parameters passed * to the called routine. */ switch (pkt->dn_dir) { case DN_TO_IP_OUT: - (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); + (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL, NULL); rt_unref (pkt->ro.ro_rt) ; break ; case DN_TO_IP_IN : ip_input((struct mbuf *)pkt) ; break ; case DN_TO_BDG_FWD : if (!BDG_LOADED) { /* somebody unloaded the bridge module. Drop pkt */ printf("-- dropping bridged packet trapped in pipe--\n"); m_freem(pkt->dn_m); break; } /* fallthrough */ case DN_TO_ETH_DEMUX: { struct mbuf *m = (struct mbuf *)pkt ; struct ether_header *eh; if (pkt->dn_m->m_len < ETHER_HDR_LEN && (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/bridge: pullup fail, dropping pkt\n"); break; } /* * same as ether_input, make eh be a pointer into the mbuf */ eh = mtod(pkt->dn_m, struct ether_header *); m_adj(pkt->dn_m, ETHER_HDR_LEN); /* * bdg_forward() wants a pointer to the pseudo-mbuf-header, but * on return it will supply the pointer to the actual packet * (originally pkt->dn_m, but could be something else now) if * it has not consumed it. */ if (pkt->dn_dir == DN_TO_BDG_FWD) { m = bdg_forward_ptr(m, eh, pkt->ifp); if (m) m_freem(m); } else ether_demux(NULL, eh, m); /* which consumes the mbuf */ } break ; case DN_TO_ETH_OUT: ether_output_frame(pkt->ifp, (struct mbuf *)pkt); break; default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(pkt->dn_m); break ; } free(pkt, M_DUMMYNET); } /* if there are leftover packets, put into the heap for next event */ if ( (pkt = pipe->head) ) heap_insert(&extract_heap, pkt->output_time, pipe ) ; /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ } /* * the following macro computes how many ticks we have to wait * before being able to transmit a packet. The credit is taken from * either a pipe (WF2Q) or a flow_queue (per-flow queueing) */ #define SET_TICKS(pkt, q, p) \ (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ p->bandwidth ; /* * extract pkt from queue, compute output time (could be now) * and put into delay line (p_queue) */ static void move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, struct dn_pipe *p, int len) { q->head = DN_NEXT(pkt) ; q->len-- ; q->len_bytes -= len ; pkt->output_time = curr_time + p->delay ; if (p->head == NULL) p->head = pkt; else DN_NEXT(p->tail) = pkt; p->tail = pkt; DN_NEXT(p->tail) = NULL; } /* * ready_event() is invoked every time the queue must enter the * scheduler, either because the first packet arrives, or because * a previously scheduled event fired. 
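 *
 * (Worked example with illustrative numbers: through the SET_TICKS
 * macro above, with hz = 100 and p->bandwidth = 1000000 bits/s, a
 * 1250-byte packet starting from zero credit costs
 * 1250 * 8 * 100 = 1000000 units, i.e. exactly one tick of 10 ms -
 * its wire time at 1 Mbit/s.)
 *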
* On invocation, drain as many pkts as possible (could be 0) and then * if there are leftover packets reinsert the pkt in the scheduler. */ static void ready_event(struct dn_flow_queue *q) { struct dn_pkt *pkt; struct dn_pipe *p = q->fs->pipe ; int p_was_empty ; if (p == NULL) { printf("ready_event- pipe is gone\n"); return ; } p_was_empty = (p->head == NULL) ; /* * schedule fixed-rate queues linked to this pipe: * Account for the bw accumulated since last scheduling, then * drain as many pkts as allowed by q->numbytes and move to * the delay line (in p) computing output time. * bandwidth==0 (no limit) means we can drain the whole queue, * setting len_scaled = 0 does the job. */ q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; while ( (pkt = q->head) != NULL ) { int len = pkt->dn_m->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; if (len_scaled > q->numbytes ) break ; q->numbytes -= len_scaled ; move_pkt(pkt, q, p, len); } /* * If we have more packets queued, schedule next ready event * (can only occur when bandwidth != 0, otherwise we would have * flushed the whole queue in the previous loop). * To this purpose we record the current time and compute how many * ticks to go for the finish time of the packet. */ if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ q->sched_time = curr_time ; heap_insert(&ready_heap, curr_time + t, (void *)q ); /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ } else /* RED needs to know when the queue becomes empty */ q->q_time = curr_time; /* * If the delay line was empty call transmit_event(p) now. * Otherwise, the scheduler will take care of it. */ if (p_was_empty) transmit_event(p); } /* * Called when we can transmit packets on WF2Q queues. Take pkts out of * the queues at their start time, and enqueue into the delay line. * Packets are drained until p->numbytes < 0. As long as * len_scaled >= p->numbytes, the packet goes into the delay line * with a deadline p->delay. For the last packet, if p->numbytes<0, * there is an additional delay. */ static void ready_event_wfq(struct dn_pipe *p) { int p_was_empty = (p->head == NULL) ; struct dn_heap *sch = &(p->scheduler_heap); struct dn_heap *neh = &(p->not_eligible_heap) ; if (p->if_name[0] == 0) /* tx clock is simulated */ p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; else { /* tx clock is for real, the ifq must be empty or this is a NOP */ if (p->ifp && p->ifp->if_snd.ifq_head != NULL) return ; else { DEB(printf("pipe %d ready from %s --\n", p->pipe_nr, p->if_name);) } } /* * While we have backlogged traffic AND credit, we need to do * something on the queue. */ while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { if (sch->elements > 0) { /* have some eligible pkts to send out */ struct dn_flow_queue *q = sch->p[0].object ; struct dn_pkt *pkt = q->head; struct dn_flow_set *fs = q->fs; u_int64_t len = pkt->dn_m->m_pkthdr.len; int len_scaled = p->bandwidth ? len*8*hz : 0 ; heap_extract(sch, NULL); /* remove queue from heap */ p->numbytes -= len_scaled ; move_pkt(pkt, q, p, len); p->V += (len<<MY_M) / p->sum ; /* update V */ q->S = q->F ; /* update start time */ if (q->len == 0) { /* Flow not backlogged any more */ fs->backlogged-- ; heap_insert(&(p->idle_heap), q->F, q); } else { /* still backlogged */ /* * update F and position in backlogged queue, then * put flow in not_eligible_heap (we will fix this later).
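 *
 * (Worked example with illustrative numbers: a flow of weight 2
 * sending a 1000-byte packet advances its finish time F by
 * (1000 << MY_M) / 2, half the advance of a weight-1 flow, so over
 * time it gets twice the weight-1 share of the link - the WF2Q+
 * virtual clock in a nutshell.)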
*/ len = (q->head)->dn_m->m_pkthdr.len; q->F += (len<<MY_M)/(u_int64_t) fs->weight ; if (DN_KEY_LEQ(q->S, p->V)) heap_insert(neh, q->S, q); else heap_insert(sch, q->F, q); } } /* * now compute V = max(V, min(S_i)). Remember that all elements in sch * have by definition S_i <= V so if sch is not empty, V is surely * the max and we must not update it. Conversely, if sch is empty * we only need to look at neh. */ if (sch->elements == 0 && neh->elements > 0) p->V = MAX64 ( p->V, neh->p[0].key ); /* move from neh to sch any packets that have become eligible */ while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { struct dn_flow_queue *q = neh->p[0].object ; heap_extract(neh, NULL); heap_insert(sch, q->F, q); } if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ p->numbytes = -1 ; /* mark not ready for I/O */ break ; } } if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0 && p->idle_heap.elements > 0) { /* * no traffic and no events scheduled. We can get rid of idle-heap. */ int i ; for (i = 0 ; i < p->idle_heap.elements ; i++) { struct dn_flow_queue *q = p->idle_heap.p[i].object ; q->F = 0 ; q->S = q->F + 1 ; } p->sum = 0 ; p->V = 0 ; p->idle_heap.elements = 0 ; } /* * If we are getting clocks from dummynet (not a real interface) and * if we are under credit, schedule the next ready event. * Also fix the delivery time of the last packet. */ if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ dn_key t=0 ; /* number of ticks i have to wait */ if (p->bandwidth > 0) t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; p->tail->output_time += t ; p->sched_time = curr_time ; heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); /* XXX should check errors on heap_insert, and drain the whole * queue on error hoping next time we are luckier. */ } /* * If the delay line was empty call transmit_event(p) now. * Otherwise, the scheduler will take care of it. */ if (p_was_empty) transmit_event(p); } /* * This is called once per tick, or HZ times per second. It is used to * increment the current tick counter and schedule expired events. */ static void dummynet(void * __unused unused) { void *p ; /* generic parameter to handler */ struct dn_heap *h ; int s ; struct dn_heap *heaps[3]; int i; struct dn_pipe *pe ; heaps[0] = &ready_heap ; /* fixed-rate queues */ heaps[1] = &wfq_ready_heap ; /* wfq queues */ heaps[2] = &extract_heap ; /* delay line */ s = splimp(); /* see note on top, splnet() is not enough */ curr_time++ ; for (i=0; i < 3 ; i++) { h = heaps[i]; while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { DDB(if (h->p[0].key > curr_time) printf("-- dummynet: warning, heap %d is %d ticks late\n", i, (int)(curr_time - h->p[0].key));) p = h->p[0].object ; /* store a copy before heap_extract */ heap_extract(h, NULL); /* need to extract before processing */ if (i == 0) ready_event(p) ; else if (i == 1) { struct dn_pipe *pipe = p; if (pipe->if_name[0] != '\0') printf("*** bad ready_event_wfq for pipe %s\n", pipe->if_name); else ready_event_wfq(p) ; } else transmit_event(p); } } /* sweep pipes trying to expire idle flow_queues */ for (pe = all_pipes; pe ; pe = pe->next ) if (pe->idle_heap.elements > 0 && DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) { struct dn_flow_queue *q = pe->idle_heap.p[0].object ; heap_extract(&(pe->idle_heap), NULL); q->S = q->F + 1 ; /* mark timestamp as invalid */ pe->sum -= q->fs->weight ; } splx(s); dn_timeout = timeout(dummynet, NULL, 1); } /* * called by an interface when tx_rdy occurs.
*/ int if_tx_rdy(struct ifnet *ifp) { struct dn_pipe *p; for (p = all_pipes; p ; p = p->next ) if (p->ifp == ifp) break ; if (p == NULL) { char buf[32]; sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); for (p = all_pipes; p ; p = p->next ) if (!strcmp(p->if_name, buf) ) { p->ifp = ifp ; DEB(printf("++ tx rdy from %s (now found)\n", buf);) break ; } } if (p != NULL) { DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, ifp->if_unit, ifp->if_snd.ifq_len);) p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p); } return 0; } /* * Unconditionally expire empty queues in case of shortage. * Returns the number of queues freed. */ static int expire_queues(struct dn_flow_set *fs) { struct dn_flow_queue *q, *prev ; int i, initial_elements = fs->rq_elements ; if (fs->last_expired == time_second) return 0 ; fs->last_expired = time_second ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) if (q->head != NULL || q->S != q->F+1) { prev = q ; q = q->next ; } else { /* entry is idle, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else fs->rq[i] = q = q->next ; fs->rq_elements-- ; free(old_q, M_DUMMYNET); } return initial_elements - fs->rq_elements ; } /* * If room, create a new queue and put at head of slot i; * otherwise, create or use the default queue. */ static struct dn_flow_queue * create_queue(struct dn_flow_set *fs, int i) { struct dn_flow_queue *q ; if (fs->rq_elements > fs->rq_size * dn_max_ratio && expire_queues(fs) == 0) { /* * No way to get room, use or create overflow queue. */ i = fs->rq_size ; if ( fs->rq[i] != NULL ) return fs->rq[i] ; } q = malloc(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (q == NULL) { printf("sorry, cannot allocate queue for new flow\n"); return NULL ; } q->fs = fs ; q->hash_slot = i ; q->next = fs->rq[i] ; q->S = q->F + 1; /* hack - mark timestamp as invalid */ fs->rq[i] = q ; fs->rq_elements++ ; return q ; } /* * Given a flow_set and a pkt in last_pkt, find a matching queue * after appropriate masking. The queue is moved to front * so that further searches take less time. 
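 */

/*
 * Worked example (illustrative): with a flow mask whose dst_ip is
 * 0xffffff00 and all other fields zero, every packet towards the same
 * /24 destination prefix is masked to the same ipfw_flow_id, hashes to
 * the same slot, and therefore shares one dn_flow_queue - one queue
 * (and one fair share of the pipe) per destination /24.
 */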
*/ static struct dn_flow_queue * find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) { int i = 0 ; /* we need i and q for new allocations */ struct dn_flow_queue *q, *prev; if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) q = fs->rq[0] ; else { /* first, do the masking */ id->dst_ip &= fs->flow_mask.dst_ip ; id->src_ip &= fs->flow_mask.src_ip ; id->dst_port &= fs->flow_mask.dst_port ; id->src_port &= fs->flow_mask.src_port ; id->proto &= fs->flow_mask.proto ; id->flags = 0 ; /* we don't care about this one */ /* then, hash function */ i = ( (id->dst_ip) & 0xffff ) ^ ( (id->dst_ip >> 15) & 0xffff ) ^ ( (id->src_ip << 1) & 0xffff ) ^ ( (id->src_ip >> 16 ) & 0xffff ) ^ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto ); i = i % fs->rq_size ; /* finally, scan the current list for a match */ searches++ ; for (prev=NULL, q = fs->rq[i] ; q ; ) { search_steps++; if (bcmp(id, &(q->id), sizeof(q->id) ) == 0) break ; /* found */ else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { /* entry is idle and not in any heap, expire it */ struct dn_flow_queue *old_q = q ; if (prev != NULL) prev->next = q = q->next ; else fs->rq[i] = q = q->next ; fs->rq_elements-- ; free(old_q, M_DUMMYNET); continue ; } prev = q ; q = q->next ; } if (q && prev != NULL) { /* found and not in front */ prev->next = q->next ; q->next = fs->rq[i] ; fs->rq[i] = q ; } } if (q == NULL) { /* no match, need to allocate a new entry */ q = create_queue(fs, i); if (q != NULL) q->id = *id ; } return q ; } static int red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) { /* * RED algorithm * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. * */ int64_t p_b = 0; /* queue in bytes or packets ? */ u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) /* average queue size estimation */ if (q_size != 0) { /* * queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); q->avg += (int) v; } else { /* * queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = (curr_time - q->q_time) / fs->lookup_step; q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } DEB(printf("avg: %u ", SCALE_VAL(q->avg));) /* should i drop ? 
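 */

/*
 * A stand-alone sketch of the fixed-point EWMA just described,
 * avg <- avg + (q_size - avg) * w_q. The 16-bit shift mirrors the
 * SCALE()/SCALE_MUL() idiom used above, but the shift amount and
 * w_q value are assumptions of this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define SC		16
#define SCALE(x)	((int64_t)(x) << SC)
#define SCALE_MUL(a, b)	(((a) * (b)) >> SC)
#define SCALE_VAL(x)	((x) >> SC)

int
main(void)
{
	int64_t avg = 0;
	int64_t w_q = SCALE(1) / 500;	/* w_q = 0.002 */
	int samples[] = { 10, 12, 8, 0, 4 }, q_size;
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		q_size = samples[i];
		/* low-pass filter, all arithmetic in fixed point */
		avg += SCALE_MUL(SCALE(q_size) - avg, w_q);
		printf("q=%2d avg=%lld.%04lld\n", q_size,
		    (long long)SCALE_VAL(avg),
		    (long long)(((avg & (SCALE(1) - 1)) * 10000) >> SC));
	}
	return 0;
}

/*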
*/ if (q->avg < fs->min_th) { q->count = -1; return 0; /* accept packet ; */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->flags_fs & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than max_th the * packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; } else { q->count = -1; printf("- drop"); return 1 ; } } else if (q->avg > fs->min_th) { /* * we compute p_b using the linear dropping function p_b = c_1 * * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = * max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; } if (fs->flags_fs & DN_QSIZE_IS_BYTES) p_b = (p_b * len) / fs->max_pkt_size; if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { q->count = 0; DEB(printf("- red drop");) /* after a drop we calculate a new random value */ q->random = random() & 0xffff; return 1; /* drop */ } } /* end of RED algorithm */ return 0 ; /* accept */ } static __inline struct dn_flow_set * locate_flowset(int pipe_nr, struct ip_fw *rule) { #if IPFW2 struct dn_flow_set *fs; ipfw_insn *cmd = rule->cmd + rule->act_ofs; if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); fs = ((ipfw_insn_pipe *)cmd)->pipe_ptr; if (fs != NULL) return fs; if (cmd->opcode == O_QUEUE) #else /* !IPFW2 */ struct dn_flow_set *fs = NULL ; if ( (rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) #endif /* !IPFW2 */ for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) ; else { struct dn_pipe *p1; for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) ; if (p1 != NULL) fs = &(p1->fs) ; } /* record for the future */ #if IPFW2 ((ipfw_insn_pipe *)cmd)->pipe_ptr = fs; #else if (fs != NULL) rule->pipe_ptr = fs; #endif return fs ; } /* * dummynet hook for packets. Below 'pipe' is a pipe or a queue * depending on whether WF2Q or fixed bw is used. * * pipe_nr pipe or queue the packet is destined for. * dir where shall we send the packet after dummynet. * m the mbuf with the packet * ifp the 'ifp' parameter from the caller. * NULL in ip_input, destination interface in ip_output, * real_dst in bdg_forward * ro route parameter (only used in ip_output, NULL otherwise) * dst destination address, only used by ip_output * rule matching rule, in case of multiple passes * flags flags from the caller, only used in ip_output * */ static int dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) { struct dn_pkt *pkt; struct dn_flow_set *fs; struct dn_pipe *pipe ; u_int64_t len = m->m_pkthdr.len ; struct dn_flow_queue *q = NULL ; int s = splimp(); int is_pipe; #if IPFW2 ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs; if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); is_pipe = (cmd->opcode == O_PIPE); #else is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE; #endif pipe_nr &= 0xffff ; /* * this is a dummynet rule, so we expect a O_PIPE or O_QUEUE rule */ fs = locate_flowset(pipe_nr, fwa->rule); if (fs == NULL) goto dropit ; /* this queue/pipe does not exist! 
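 */

/*
 * A worked, stand-alone sketch of the linear drop probability
 * p_b = c_1*avg - c_2 and the q->count de-randomization used by
 * red_drops(). The RED profile (min_th=5, max_th=15 packets,
 * max_p=0.1) and the 16-bit scale are assumed values for the sketch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define SC		16
#define SCALE(x)	((int64_t)(x) << SC)
#define SCALE_MUL(a, b)	(((a) * (b)) >> SC)

int
main(void)
{
	int64_t min_th = SCALE(5);
	int64_t max_p = SCALE(1) / 10;
	/* c_1 = max_p/(max_th-min_th), c_2 = max_p*min_th/(max_th-min_th) */
	int64_t c_1 = max_p / (15 - 5), c_2 = SCALE_MUL(c_1, min_th);
	int64_t avg, p_b;
	int count = 0;

	srandom(42);
	for (avg = min_th; avg <= SCALE(15); avg += SCALE(2)) {
		p_b = SCALE_MUL(c_1, avg) - c_2;
		count++;	/* packets since last drop */
		/* drop roughly once every 1/p_b arrivals */
		if (SCALE_MUL(p_b, SCALE((int64_t)count)) >
		    (random() & 0xffff)) {
			printf("avg=%2lld: drop (count was %d)\n",
			    (long long)(avg >> SC), count);
			count = 0;
		} else
			printf("avg=%2lld: accept\n", (long long)(avg >> SC));
	}
	return 0;
}

/*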
 */
    pipe = fs->pipe ;
    if (pipe == NULL) { /* must be a queue, try find a matching pipe */
	for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
		pipe = pipe->next)
	    ;
	if (pipe != NULL)
	    fs->pipe = pipe ;
	else {
	    printf("No pipe %d for queue %d, drop pkt\n",
		fs->parent_nr, fs->fs_nr);
	    goto dropit ;
	}
    }
    q = find_queue(fs, &(fwa->f_id));
    if ( q == NULL )
	goto dropit ;		/* cannot allocate queue */
    /*
     * update statistics, then check reasons to drop pkt
     */
    q->tot_bytes += len ;
    q->tot_pkts++ ;
    if ( fs->plr && random() < fs->plr )
	goto dropit ;		/* random pkt drop */
    if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
	if (q->len_bytes > fs->qsize)
	    goto dropit ;	/* queue size overflow */
    } else {
	if (q->len >= fs->qsize)
	    goto dropit ;	/* queue count overflow */
    }
    if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
	goto dropit ;

    /* XXX expensive to zero, see if we can remove it */
    pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_DUMMYNET, M_NOWAIT|M_ZERO);
    if ( pkt == NULL )
	goto dropit ;		/* cannot allocate packet header */
    /* ok, i can handle the pkt now... */
    /* build and enqueue packet + parameters */
    pkt->hdr.mh_type = MT_TAG;
    pkt->hdr.mh_flags = PACKET_TAG_DUMMYNET;
    pkt->rule = fwa->rule ;
    DN_NEXT(pkt) = NULL;
    pkt->dn_m = m;
    pkt->dn_dir = dir ;

    pkt->ifp = fwa->oif;
    if (dir == DN_TO_IP_OUT) {
	/*
	 * We need to copy *ro because for ICMP pkts (and maybe others)
	 * the caller passed a pointer into the stack; dst might also be
	 * a pointer into *ro so it needs to be updated.
	 */
	pkt->ro = *(fwa->ro);
	if (fwa->ro->ro_rt)
	    fwa->ro->ro_rt->rt_refcnt++ ;
	if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst)
	    /* dst points into ro */
	    fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;

	pkt->dn_dst = fwa->dst;
	pkt->flags = fwa->flags;
    }
    if (q->head == NULL)
	q->head = pkt;
    else
	DN_NEXT(q->tail) = pkt;
    q->tail = pkt;
    q->len++;
    q->len_bytes += len ;

    if ( q->head != pkt )	/* flow was not idle, we are done */
	goto done;
    /*
     * If we reach this point the flow was previously idle, so we need
     * to schedule it. This involves different actions for fixed-rate or
     * WF2Q queues.
     */
    if (is_pipe) {
	/*
	 * Fixed-rate queue: just insert into the ready_heap.
	 */
	dn_key t = 0 ;

	if (pipe->bandwidth)
	    t = SET_TICKS(pkt, q, pipe);
	q->sched_time = curr_time ;
	if (t == 0)	/* must process it now */
	    ready_event( q );
	else
	    heap_insert(&ready_heap, curr_time + t , q );
    } else {
	/*
	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
	 * set S to the virtual time V for the controlling pipe, and update
	 * the sum of weights for the pipe; otherwise, remove flow from
	 * idle_heap and set S to max(F,V).
	 * Second, compute finish time F = S + len/weight.
	 * Third, if pipe was idle, update V=max(S, V).
	 * Fourth, count one more backlogged flow.
	 */
	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
	    q->S = pipe->V ;
	    pipe->sum += fs->weight ; /* add weight of new queue */
	} else {
	    heap_extract(&(pipe->idle_heap), q);
	    q->S = MAX64(q->F, pipe->V ) ;
	}
	q->F = q->S + ( len << MY_M ) / (u_int64_t) fs->weight;

	if (pipe->not_eligible_heap.elements == 0 &&
		pipe->scheduler_heap.elements == 0)
	    pipe->V = MAX64 ( q->S, pipe->V );
	fs->backlogged++ ;
	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the scheduler_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
	 * and possibly invoke ready_event_wfq() right now if there is
	 * leftover credit.
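 */

/*
 * A worked, stand-alone example of the WF2Q timestamp update just
 * described: S from the pipe's virtual time (idle flow) or from the
 * previous finish time (backlogged flow), then F = S + len/weight in
 * fixed point. The MY_M value is an assumption of this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define MY_M	16	/* fixed-point shift, assumed here */
#define MAX64(x, y)	(((int64_t)((y) - (x))) > 0 ? (y) : (x))

int
main(void)
{
	uint64_t V = 1000;	/* pipe virtual time */
	uint64_t S, F = 0;	/* per-flow start/finish timestamps */
	uint64_t len = 1500;	/* packet length, bytes */
	int weight = 10;

	/* flow was idle: start at the pipe's virtual time */
	S = V;
	F = S + (len << MY_M) / (uint64_t)weight;
	printf("idle flow:  S=%llu F=%llu\n",
	    (unsigned long long)S, (unsigned long long)F);

	/* flow already backlogged: start when the previous pkt finished */
	S = MAX64(F, V);
	F = S + (len << MY_M) / (uint64_t)weight;
	printf("backlogged: S=%llu F=%llu\n",
	    (unsigned long long)S, (unsigned long long)F);
	return 0;
}

/*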
* Note that for all flows in scheduler_heap (SCH), S_i <= V, * and for all flows in not_eligible_heap (NEH), S_i > V . * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, * we only need to look into NEH. */ if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ if (pipe->scheduler_heap.elements == 0) printf("++ ouch! not eligible but empty scheduler!\n"); heap_insert(&(pipe->not_eligible_heap), q->S, q); } else { heap_insert(&(pipe->scheduler_heap), q->F, q); if (pipe->numbytes >= 0) { /* pipe is idle */ if (pipe->scheduler_heap.elements != 1) printf("*** OUCH! pipe should have been idle!\n"); DEB(printf("Waking up pipe %d at %d\n", pipe->pipe_nr, (int)(q->F >> MY_M)); ) pipe->sched_time = curr_time ; ready_event_wfq(pipe); } } } done: splx(s); return 0; dropit: splx(s); if (q) q->drops++ ; m_freem(m); return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS); } /* * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) * Doing this would probably save us the initial bzero of dn_pkt */ #define DN_FREE_PKT(pkt) { \ struct dn_pkt *n = pkt ; \ rt_unref ( n->ro.ro_rt ) ; \ m_freem(n->dn_m); \ pkt = DN_NEXT(n) ; \ free(n, M_DUMMYNET) ; } /* * Dispose all packets and flow_queues on a flow_set. * If all=1, also remove red lookup table and other storage, * including the descriptor itself. * For the one in dn_pipe MUST also cleanup ready_heap... */ static void purge_flow_set(struct dn_flow_set *fs, int all) { struct dn_pkt *pkt ; struct dn_flow_queue *q, *qn ; int i ; for (i = 0 ; i <= fs->rq_size ; i++ ) { for (q = fs->rq[i] ; q ; q = qn ) { for (pkt = q->head ; pkt ; ) DN_FREE_PKT(pkt) ; qn = q->next ; free(q, M_DUMMYNET); } fs->rq[i] = NULL ; } fs->rq_elements = 0 ; if (all) { /* RED - free lookup table */ if (fs->w_q_lookup) free(fs->w_q_lookup, M_DUMMYNET); if (fs->rq) free(fs->rq, M_DUMMYNET); /* if this fs is not part of a pipe, free it */ if (fs->pipe && fs != &(fs->pipe->fs) ) free(fs, M_DUMMYNET); } } /* * Dispose all packets queued on a pipe (not a flow_set). * Also free all resources associated to a pipe, which is about * to be deleted. */ static void purge_pipe(struct dn_pipe *pipe) { struct dn_pkt *pkt ; purge_flow_set( &(pipe->fs), 1 ); for (pkt = pipe->head ; pkt ; ) DN_FREE_PKT(pkt) ; heap_free( &(pipe->scheduler_heap) ); heap_free( &(pipe->not_eligible_heap) ); heap_free( &(pipe->idle_heap) ); } /* * Delete all pipes and heaps returning memory. Must also * remove references from all ipfw rules to all pipes. */ static void dummynet_flush() { struct dn_pipe *curr_p, *p ; struct dn_flow_set *fs, *curr_fs; int s ; s = splimp() ; /* remove all references to pipes ...*/ flush_pipe_ptrs(NULL); /* prevent future matches... */ p = all_pipes ; all_pipes = NULL ; fs = all_flow_sets ; all_flow_sets = NULL ; /* and free heaps so we don't have unwanted events */ heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); splx(s) ; /* * Now purge all queued pkts and delete all pipes */ /* scan and purge all flow_sets. 
*/ for ( ; fs ; ) { curr_fs = fs ; fs = fs->next ; purge_flow_set(curr_fs, 1); } for ( ; p ; ) { purge_pipe(p); curr_p = p ; p = p->next ; free(curr_p, M_DUMMYNET); } } extern struct ip_fw *ip_fw_default_rule ; static void dn_rule_delete_fs(struct dn_flow_set *fs, void *r) { int i ; struct dn_flow_queue *q ; struct dn_pkt *pkt ; for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ for (q = fs->rq[i] ; q ; q = q->next ) for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } /* * when a firewall rule is deleted, scan all queues and remove the flow-id * from packets matching this rule. */ void dn_rule_delete(void *r) { struct dn_pipe *p ; struct dn_pkt *pkt ; struct dn_flow_set *fs ; /* * If the rule references a queue (dn_flow_set), then scan * the flow set, otherwise scan pipes. Should do either, but doing * both does not harm. */ for ( fs = all_flow_sets ; fs ; fs = fs->next ) dn_rule_delete_fs(fs, r); for ( p = all_pipes ; p ; p = p->next ) { fs = &(p->fs) ; dn_rule_delete_fs(fs, r); for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) if (pkt->rule == r) pkt->rule = ip_fw_default_rule ; } } /* * setup RED parameters */ static int config_red(struct dn_flow_set *p, struct dn_flow_set * x) { int i; x->w_q = p->w_q; x->min_th = SCALE(p->min_th); x->max_th = SCALE(p->max_th); x->max_p = p->max_p; x->c_1 = p->max_p / (p->max_th - p->min_th); x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); if (x->flags_fs & DN_IS_GENTLE_RED) { x->c_3 = (SCALE(1) - p->max_p) / p->max_th; x->c_4 = (SCALE(1) - 2 * p->max_p); } /* if the lookup table already exist, free and create it again */ if (x->w_q_lookup) { free(x->w_q_lookup, M_DUMMYNET); x->w_q_lookup = NULL ; } if (red_lookup_depth == 0) { printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); free(x, M_DUMMYNET); return EINVAL; } x->lookup_depth = red_lookup_depth; x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int), M_DUMMYNET, M_DONTWAIT); if (x->w_q_lookup == NULL) { printf("sorry, cannot allocate red lookup table\n"); free(x, M_DUMMYNET); return ENOSPC; } /* fill the lookup table with (1 - w_q)^x */ x->lookup_step = p->lookup_step ; x->lookup_weight = p->lookup_weight ; x->w_q_lookup[0] = SCALE(1) - x->w_q; for (i = 1; i < x->lookup_depth; i++) x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); if (red_avg_pkt_size < 1) red_avg_pkt_size = 512 ; x->avg_pkt_size = red_avg_pkt_size ; if (red_max_pkt_size < 1) red_max_pkt_size = 1500 ; x->max_pkt_size = red_max_pkt_size ; return 0 ; } static int alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) { if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ int l = pfs->rq_size; if (l == 0) l = dn_hash_size; if (l < 4) l = 4; else if (l > DN_MAX_HASH_SIZE) l = DN_MAX_HASH_SIZE; x->rq_size = l; } else /* one is enough for null mask */ x->rq_size = 1; x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (x->rq == NULL) { printf("sorry, cannot allocate queue\n"); return ENOSPC; } x->rq_elements = 0; return 0 ; } static void set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) { x->flags_fs = src->flags_fs; x->qsize = src->qsize; x->plr = src->plr; x->flow_mask = src->flow_mask; if (x->flags_fs & DN_QSIZE_IS_BYTES) { if (x->qsize > 1024*1024) x->qsize = 1024*1024 ; } else { if (x->qsize == 0) x->qsize = 50 ; if (x->qsize > 100) x->qsize = 50 ; } /* configuring RED */ if ( x->flags_fs & DN_IS_RED ) config_red(src, x) ; /* XXX should check errors */ } /* * 
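 */

/*
 * A stand-alone sketch of the idle-decay table config_red() above
 * precomputes: tab[t] ~ (1 - w_q)^(t+1), built by repeated fixed-point
 * multiplication. For simplicity this sketch assumes lookup_step = 1,
 * so the per-entry weight is just (1 - w_q); the w_q value and table
 * depth are also assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define SC		16
#define SCALE(x)	((int64_t)(x) << SC)
#define SCALE_MUL(a, b)	(((a) * (b)) >> SC)

int
main(void)
{
	int64_t w_q = SCALE(1) / 500;	/* w_q = 0.002 */
	int64_t tab[8];
	int i;

	tab[0] = SCALE(1) - w_q;	/* (1 - w_q) */
	for (i = 1; i < 8; i++)		/* same recurrence as w_q_lookup[] */
		tab[i] = SCALE_MUL(tab[i - 1], tab[0]);
	for (i = 0; i < 8; i++)
		printf("tab[%d] = %.6f\n", i, (double)tab[i] / SCALE(1));
	return 0;
}

/*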
setup pipe or queue parameters. */ static int config_pipe(struct dn_pipe *p) { int s ; struct dn_flow_set *pfs = &(p->fs); /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes */ p->delay = ( p->delay * hz ) / 1000 ; /* We need either a pipe number or a flow_set number */ if (p->pipe_nr == 0 && pfs->fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && pfs->fs_nr != 0) return EINVAL ; if (p->pipe_nr != 0) { /* this is a pipe */ struct dn_pipe *x, *a, *b; /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ x = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO); if (x == NULL) { printf("ip_dummynet.c: no memory for new pipe\n"); return ENOSPC; } x->pipe_nr = p->pipe_nr; x->fs.pipe = x ; /* idle_heap is the only one from which we extract from the middle. */ x->idle_heap.size = x->idle_heap.elements = 0 ; x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); } else x = b; x->bandwidth = p->bandwidth ; x->numbytes = 0; /* just in case... */ bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); x->ifp = NULL ; /* reset interface ptr */ x->delay = p->delay ; set_fs_parms(&(x->fs), pfs); if ( x->fs.rq == NULL ) { /* a new pipe */ s = alloc_hash(&(x->fs), pfs) ; if (s) { free(x, M_DUMMYNET); return s ; } s = splimp() ; x->next = b ; if (a == NULL) all_pipes = x ; else a->next = x ; splx(s); } } else { /* config queue */ struct dn_flow_set *x, *a, *b ; /* locate flow_set */ for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; a = b , b = b->next) ; if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ if (pfs->parent_nr == 0) /* need link to a pipe */ return EINVAL ; x = malloc(sizeof(struct dn_flow_set),M_DUMMYNET,M_DONTWAIT|M_ZERO); if (x == NULL) { printf("ip_dummynet.c: no memory for new flow_set\n"); return ENOSPC; } x->fs_nr = pfs->fs_nr; x->parent_nr = pfs->parent_nr; x->weight = pfs->weight ; if (x->weight == 0) x->weight = 1 ; else if (x->weight > 100) x->weight = 100 ; } else { /* Change parent pipe not allowed; must delete and recreate */ if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) return EINVAL ; x = b; } set_fs_parms(x, pfs); if ( x->rq == NULL ) { /* a new flow_set */ s = alloc_hash(x, pfs) ; if (s) { free(x, M_DUMMYNET); return s ; } s = splimp() ; x->next = b; if (a == NULL) all_flow_sets = x; else a->next = x; splx(s); } } return 0 ; } /* * Helper function to remove from a heap queues which are linked to * a flow_set about to be deleted. */ static void fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) { int i = 0, found = 0 ; for (; i < h->elements ;) if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { h->elements-- ; h->p[i] = h->p[h->elements] ; found++ ; } else i++ ; if (found) heapify(h); } /* * helper function to remove a pipe from a heap (can be there at most once) */ static void pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) { if (h->elements > 0) { int i = 0 ; for (i=0; i < h->elements ; i++ ) { if (h->p[i].object == p) { /* found it */ h->elements-- ; h->p[i] = h->p[h->elements] ; heapify(h); break ; } } } } /* * drain all queues. Called in case of severe mbuf shortage. 
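 */

/*
 * A tiny stand-alone illustration of the delay conversion config_pipe()
 * above performs, delay_ticks = delay_ms * hz / 1000. Note the integer
 * truncation: at an assumed hz of 100, anything below one tick (10 ms)
 * becomes zero.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 100;	/* assumed kernel tick rate */
	int delay_ms[] = { 5, 10, 25, 1000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%4d ms -> %3d ticks\n", delay_ms[i],
		    delay_ms[i] * hz / 1000);
	return 0;
}

/*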
*/ void dummynet_drain() { struct dn_flow_set *fs; struct dn_pipe *p; struct dn_pkt *pkt; heap_free(&ready_heap); heap_free(&wfq_ready_heap); heap_free(&extract_heap); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) purge_flow_set(fs, 0); for (p = all_pipes; p; p= p->next ) { purge_flow_set(&(p->fs), 0); for (pkt = p->head ; pkt ; ) DN_FREE_PKT(pkt) ; p->head = p->tail = NULL ; } } /* * Fully delete a pipe or a queue, cleaning up associated info. */ static int delete_pipe(struct dn_pipe *p) { int s ; if (p->pipe_nr == 0 && p->fs.fs_nr == 0) return EINVAL ; if (p->pipe_nr != 0 && p->fs.fs_nr != 0) return EINVAL ; if (p->pipe_nr != 0) { /* this is an old-style pipe */ struct dn_pipe *a, *b; struct dn_flow_set *fs; /* locate pipe */ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; if (b == NULL || (b->pipe_nr != p->pipe_nr) ) return EINVAL ; /* not found */ s = splimp() ; /* unlink from list of pipes */ if (a == NULL) all_pipes = b->next ; else a->next = b->next ; /* remove references to this pipe from the ip_fw rules. */ flush_pipe_ptrs(&(b->fs)); /* remove all references to this pipe from flow_sets */ for (fs = all_flow_sets; fs; fs= fs->next ) if (fs->pipe == b) { printf("++ ref to pipe %d from fs %d\n", p->pipe_nr, fs->fs_nr); fs->pipe = NULL ; purge_flow_set(fs, 0); } fs_remove_from_heap(&ready_heap, &(b->fs)); purge_pipe(b); /* remove all data associated to this pipe */ /* remove reference to here from extract_heap and wfq_ready_heap */ pipe_remove_from_heap(&extract_heap, b); pipe_remove_from_heap(&wfq_ready_heap, b); splx(s); free(b, M_DUMMYNET); } else { /* this is a WF2Q queue (dn_flow_set) */ struct dn_flow_set *a, *b; /* locate set */ for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; a = b , b = b->next) ; if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) return EINVAL ; /* not found */ s = splimp() ; if (a == NULL) all_flow_sets = b->next ; else a->next = b->next ; /* remove references to this flow_set from the ip_fw rules. */ flush_pipe_ptrs(b); if (b->pipe != NULL) { /* Update total weight on parent pipe and cleanup parent heaps */ b->pipe->sum -= b->weight * b->backlogged ; fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); fs_remove_from_heap(&(b->pipe->scheduler_heap), b); #if 1 /* XXX should i remove from idle_heap as well ? */ fs_remove_from_heap(&(b->pipe->idle_heap), b); #endif } purge_flow_set(b, 1); splx(s); } return 0 ; } /* * helper function used to copy data from kernel in DUMMYNET_GET */ static char * dn_copy_set(struct dn_flow_set *set, char *bp) { int i, copied = 0 ; struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; for (i = 0 ; i <= set->rq_size ; i++) for (q = set->rq[i] ; q ; q = q->next, qp++ ) { if (q->hash_slot != i) printf("++ at %d: wrong slot (have %d, " "should be %d)\n", copied, q->hash_slot, i); if (q->fs != set) printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", i, q->fs, set); copied++ ; bcopy(q, qp, sizeof( *q ) ); /* cleanup pointers */ qp->next = NULL ; qp->head = qp->tail = NULL ; qp->fs = NULL ; } if (copied != set->rq_elements) printf("++ wrong count, have %d should be %d\n", copied, set->rq_elements); return (char *)qp ; } static int dummynet_get(struct sockopt *sopt) { char *buf, *bp ; /* bp is the "copy-pointer" */ size_t size ; struct dn_flow_set *set ; struct dn_pipe *p ; int s, error=0 ; s = splimp(); /* * compute size of data structures: list of pipes and flow_sets. 
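 */

/*
 * A stand-alone sketch of the two-pass "measure, then flatten" pattern
 * dummynet_get() uses: one walk to add up the space, one walk to copy
 * every element into a single buffer with its pointers cleaned. The
 * struct rec type is illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rec { struct rec *next; char name[8]; };

int
main(void)
{
	struct rec c = { NULL, "gamma" }, b = { &c, "beta" }, a = { &b, "alpha" };
	struct rec *p;
	char *buf, *bp;
	size_t size = 0;

	/* pass 1: compute the total size needed */
	for (p = &a; p != NULL; p = p->next)
		size += sizeof(*p);
	if ((buf = malloc(size)) == NULL)
		return 1;
	/* pass 2: flatten each element, clearing kernel-only pointers */
	for (p = &a, bp = buf; p != NULL; p = p->next) {
		memcpy(bp, p, sizeof(*p));
		((struct rec *)bp)->next = NULL;
		bp += sizeof(*p);
	}
	printf("copied %zu bytes\n", (size_t)(bp - buf));
	free(buf);
	return 0;
}

/*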
*/ for (p = all_pipes, size = 0 ; p ; p = p->next ) size += sizeof( *p ) + p->fs.rq_elements * sizeof(struct dn_flow_queue); for (set = all_flow_sets ; set ; set = set->next ) size += sizeof ( *set ) + set->rq_elements * sizeof(struct dn_flow_queue); buf = malloc(size, M_TEMP, M_DONTWAIT); if (buf == 0) { splx(s); return ENOBUFS ; } for (p = all_pipes, bp = buf ; p ; p = p->next ) { struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; /* * copy pipe descriptor into *bp, convert delay back to ms, * then copy the flow_set descriptor(s) one at a time. * After each flow_set, copy the queue descriptor it owns. */ bcopy(p, bp, sizeof( *p ) ); pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; /* * XXX the following is a hack based on ->next being the * first field in dn_pipe and dn_flow_set. The correct * solution would be to move the dn_flow_set to the beginning * of struct dn_pipe. */ pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ; /* clean pointers */ pipe_bp->head = pipe_bp->tail = NULL ; pipe_bp->fs.next = NULL ; pipe_bp->fs.pipe = NULL ; pipe_bp->fs.rq = NULL ; bp += sizeof( *p ) ; bp = dn_copy_set( &(p->fs), bp ); } for (set = all_flow_sets ; set ; set = set->next ) { struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; bcopy(set, bp, sizeof( *set ) ); /* XXX same hack as above */ fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ; fs_bp->pipe = NULL ; fs_bp->rq = NULL ; bp += sizeof( *set ) ; bp = dn_copy_set( set, bp ); } splx(s); error = sooptcopyout(sopt, buf, size); free(buf, M_TEMP); return error ; } /* * Handler for the various dummynet socket options (get, flush, config, del) */ static int ip_dn_ctl(struct sockopt *sopt) { int error = 0 ; struct dn_pipe *p, tmp_pipe; /* Disallow sets in really-really secure mode. */ if (sopt->sopt_dir == SOPT_SET) { #if __FreeBSD_version >= 500034 error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); #else if (securelevel >= 3) return (EPERM); #endif } switch (sopt->sopt_name) { default : printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); return EINVAL ; case IP_DUMMYNET_GET : error = dummynet_get(sopt); break ; case IP_DUMMYNET_FLUSH : dummynet_flush() ; break ; case IP_DUMMYNET_CONFIGURE : p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if (error) break ; error = config_pipe(p); break ; case IP_DUMMYNET_DEL : /* remove a pipe or queue */ p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if (error) break ; error = delete_pipe(p); break ; } return error ; } static void ip_dn_init(void) { printf("DUMMYNET initialized (011031)\n"); all_pipes = NULL ; all_flow_sets = NULL ; ready_heap.size = ready_heap.elements = 0 ; ready_heap.offset = 0 ; wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; wfq_ready_heap.offset = 0 ; extract_heap.size = extract_heap.elements = 0 ; extract_heap.offset = 0 ; ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; ip_dn_ruledel_ptr = dn_rule_delete; bzero(&dn_timeout, sizeof(struct callout_handle)); dn_timeout = timeout(dummynet, NULL, 1); } static int dummynet_modevent(module_t mod, int type, void *data) { int s; switch (type) { case MOD_LOAD: s = splimp(); if (DUMMYNET_LOADED) { splx(s); printf("DUMMYNET already loaded\n"); return EEXIST ; } ip_dn_init(); splx(s); break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else s = splimp(); untimeout(dummynet, NULL, dn_timeout); dummynet_flush(); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; ip_dn_ruledel_ptr = NULL; splx(s); #endif break ; 
default: break ; } return 0 ; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL }; DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(dummynet, ipfw, 1, 1, 1); MODULE_VERSION(dummynet, 1); Index: head/sys/netinet/ip_encap.c =================================================================== --- head/sys/netinet/ip_encap.c (revision 105193) +++ head/sys/netinet/ip_encap.c (revision 105194) @@ -1,522 +1,510 @@ /* $FreeBSD$ */ /* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * My grandfather said that there's a devil inside tunnelling technology... * * We have surprisingly many protocols that want packets with IP protocol * #4 or #41. Here's a list of protocols that want protocol #41: * RFC1933 configured tunnel * RFC1933 automatic tunnel * RFC2401 IPsec tunnel * RFC2473 IPv6 generic packet tunnelling * RFC2529 6over4 tunnel * mobile-ip6 (uses RFC2473) * RFC3056 6to4 tunnel * isatap tunnel * Here's a list of protocol that want protocol #4: * RFC1853 IPv4-in-IPv4 tunnelling * RFC2003 IPv4 encapsulation within IPv4 * RFC2344 reverse tunnelling for mobile-ip4 * RFC2401 IPsec tunnel * Well, what can I say. They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has * AH/ESP/IPComp header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. */ /* XXX is M_NETADDR correct? 
*/ #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void encap_add(struct encaptab *); static int mask_match(const struct encaptab *, const struct sockaddr *, const struct sockaddr *); static void encap_fillarg(struct mbuf *, const struct encaptab *); #ifndef LIST_HEAD_INITIALIZER /* rely upon BSS initialization */ LIST_HEAD(, encaptab) encaptab; #else LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); #endif void encap_init() { static int initialized = 0; if (initialized) return; initialized++; #if 0 /* * we cannot use LIST_INIT() here, since drivers may want to call * encap_attach(), on driver attach. encap_init() will be called * on AF_INET{,6} initialization, which happens after driver * initialization - using LIST_INIT() here can nuke encap_attach() * from drivers. */ LIST_INIT(&encaptab); #endif } #ifdef INET void encap4_input(m, off) struct mbuf *m; int off; { struct ip *ip; int proto; struct sockaddr_in s, d; const struct protosw *psw; struct encaptab *ep, *match; int prio, matchprio; ip = mtod(m, struct ip *); proto = ip->ip_p; bzero(&s, sizeof(s)); s.sin_family = AF_INET; s.sin_len = sizeof(struct sockaddr_in); s.sin_addr = ip->ip_src; bzero(&d, sizeof(d)); d.sin_family = AF_INET; d.sin_len = sizeof(struct sockaddr_in); d.sin_addr = ip->ip_dst; match = NULL; matchprio = 0; LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET) continue; if (ep->proto >= 0 && ep->proto != proto) continue; if (ep->func) prio = (*ep->func)(m, off, proto, ep->arg); else { /* * it's inbound traffic, we need to match in reverse * order */ prio = mask_match(ep, (struct sockaddr *)&d, (struct sockaddr *)&s); } /* * We prioritize the matches by using bit length of the * matches. mask_match() and user-supplied matching function * should return the bit length of the matches (for example, * if both src/dst are matched for IPv4, 64 should be returned). * 0 or negative return value means "it did not match". * * The question is, since we have two "mask" portion, we * cannot really define total order between entries. * For example, which of these should be preferred? * mask_match() returns 48 (32 + 16) for both of them. * src=3ffe::/16, dst=3ffe:501::/32 * src=3ffe:501::/32, dst=3ffe::/16 * * We need to loop through all the possible candidates * to get the best match - the search takes O(n) for * n attachments (i.e. interfaces). 
*/ if (prio <= 0) continue; if (prio > matchprio) { matchprio = prio; match = ep; } } if (match) { /* found a match, "match" has the best one */ psw = match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); (*psw->pr_input)(m, off); } else m_freem(m); return; } /* last resort: inject to raw socket */ rip_input(m, off); } #endif #ifdef INET6 int encap6_input(mp, offp, proto) struct mbuf **mp; int *offp; int proto; { struct mbuf *m = *mp; struct ip6_hdr *ip6; struct sockaddr_in6 s, d; const struct ip6protosw *psw; struct encaptab *ep, *match; int prio, matchprio; ip6 = mtod(m, struct ip6_hdr *); bzero(&s, sizeof(s)); s.sin6_family = AF_INET6; s.sin6_len = sizeof(struct sockaddr_in6); s.sin6_addr = ip6->ip6_src; bzero(&d, sizeof(d)); d.sin6_family = AF_INET6; d.sin6_len = sizeof(struct sockaddr_in6); d.sin6_addr = ip6->ip6_dst; match = NULL; matchprio = 0; LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET6) continue; if (ep->proto >= 0 && ep->proto != proto) continue; if (ep->func) prio = (*ep->func)(m, *offp, proto, ep->arg); else { /* * it's inbound traffic, we need to match in reverse * order */ prio = mask_match(ep, (struct sockaddr *)&d, (struct sockaddr *)&s); } /* see encap4_input() for issues here */ if (prio <= 0) continue; if (prio > matchprio) { matchprio = prio; match = ep; } } if (match) { /* found a match */ psw = (const struct ip6protosw *)match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); return (*psw->pr_input)(mp, offp, proto); } else { m_freem(m); return IPPROTO_DONE; } } /* last resort: inject to raw socket */ return rip6_input(mp, offp, proto); } #endif static void encap_add(ep) struct encaptab *ep; { LIST_INSERT_HEAD(&encaptab, ep, chain); } /* * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. * length of mask (sm and dm) is assumed to be same as sp/dp. * Return value will be necessary as input (cookie) for encap_detach(). 
*/ const struct encaptab * encap_attach(af, proto, sp, sm, dp, dm, psw, arg) int af; int proto; const struct sockaddr *sp, *sm; const struct sockaddr *dp, *dm; const struct protosw *psw; void *arg; { struct encaptab *ep; int error; int s; s = splnet(); /* sanity check on args */ if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) { error = EINVAL; goto fail; } if (sp->sa_len != dp->sa_len) { error = EINVAL; goto fail; } if (af != sp->sa_family || af != dp->sa_family) { error = EINVAL; goto fail; } /* check if anyone have already attached with exactly same config */ LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != af) continue; if (ep->proto != proto) continue; if (ep->src.ss_len != sp->sa_len || bcmp(&ep->src, sp, sp->sa_len) != 0 || bcmp(&ep->srcmask, sm, sp->sa_len) != 0) continue; if (ep->dst.ss_len != dp->sa_len || bcmp(&ep->dst, dp, dp->sa_len) != 0 || bcmp(&ep->dstmask, dm, dp->sa_len) != 0) continue; error = EEXIST; goto fail; } ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ if (ep == NULL) { error = ENOBUFS; goto fail; } bzero(ep, sizeof(*ep)); ep->af = af; ep->proto = proto; bcopy(sp, &ep->src, sp->sa_len); bcopy(sm, &ep->srcmask, sp->sa_len); bcopy(dp, &ep->dst, dp->sa_len); bcopy(dm, &ep->dstmask, dp->sa_len); ep->psw = psw; ep->arg = arg; encap_add(ep); error = 0; splx(s); return ep; fail: splx(s); return NULL; } const struct encaptab * encap_attach_func(af, proto, func, psw, arg) int af; int proto; int (*func)(const struct mbuf *, int, int, void *); const struct protosw *psw; void *arg; { struct encaptab *ep; int error; int s; s = splnet(); /* sanity check on args */ if (!func) { error = EINVAL; goto fail; } ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ if (ep == NULL) { error = ENOBUFS; goto fail; } bzero(ep, sizeof(*ep)); ep->af = af; ep->proto = proto; ep->func = func; ep->psw = psw; ep->arg = arg; encap_add(ep); error = 0; splx(s); return ep; fail: splx(s); return NULL; } int encap_detach(cookie) const struct encaptab *cookie; { const struct encaptab *ep = cookie; struct encaptab *p; LIST_FOREACH(p, &encaptab, chain) { if (p == ep) { LIST_REMOVE(p, chain); free(p, M_NETADDR); /*XXX*/ return 0; } } return EINVAL; } static int mask_match(ep, sp, dp) const struct encaptab *ep; const struct sockaddr *sp; const struct sockaddr *dp; { struct sockaddr_storage s; struct sockaddr_storage d; int i; const u_int8_t *p, *q; u_int8_t *r; int matchlen; if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) return 0; if (sp->sa_family != ep->af || dp->sa_family != ep->af) return 0; if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) return 0; matchlen = 0; p = (const u_int8_t *)sp; q = (const u_int8_t *)&ep->srcmask; r = (u_int8_t *)&s; for (i = 0 ; i < sp->sa_len; i++) { r[i] = p[i] & q[i]; /* XXX estimate */ matchlen += (q[i] ? 8 : 0); } p = (const u_int8_t *)dp; q = (const u_int8_t *)&ep->dstmask; r = (u_int8_t *)&d; for (i = 0 ; i < dp->sa_len; i++) { r[i] = p[i] & q[i]; /* XXX rough estimate */ matchlen += (q[i] ? 
8 : 0); } /* need to overwrite len/family portion as we don't compare them */ s.ss_len = sp->sa_len; s.ss_family = sp->sa_family; d.ss_len = dp->sa_len; d.ss_family = dp->sa_family; if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { return matchlen; } else return 0; } static void encap_fillarg(m, ep) struct mbuf *m; const struct encaptab *ep; { -#if 0 - m->m_pkthdr.aux = ep->arg; -#else - struct mbuf *n; + struct m_tag *tag; - n = m_aux_add(m, AF_INET, IPPROTO_IPV4); - if (n) { - *mtod(n, void **) = ep->arg; - n->m_len = sizeof(void *); + tag = m_tag_get(PACKET_TAG_ENCAP, sizeof (void*), M_NOWAIT); + if (tag) { + *(void**)(tag+1) = ep->arg; + m_tag_prepend(m, tag); } -#endif } void * encap_getarg(m) struct mbuf *m; { - void *p; -#if 0 - p = m->m_pkthdr.aux; - m->m_pkthdr.aux = NULL; - return p; -#else - struct mbuf *n; + void *p = NULL; + struct m_tag *tag; - p = NULL; - n = m_aux_find(m, AF_INET, IPPROTO_IPV4); - if (n) { - if (n->m_len == sizeof(void *)) - p = *mtod(n, void **); - m_aux_delete(m, n); + tag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); + if (tag) { + p = *(void**)(tag+1); + m_tag_delete(m, tag); } return p; -#endif } Index: head/sys/netinet/ip_fw2.c =================================================================== --- head/sys/netinet/ip_fw2.c (revision 105193) +++ head/sys/netinet/ip_fw2.c (revision 105194) @@ -1,2770 +1,2770 @@ /* * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * Implement IP packet firewall (new version) */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #define IPFW2 1 #if IPFW2 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for ETHERTYPE_IP */ #include /* XXX for in_cksum */ /* * XXX This one should go in sys/mbuf.h. It is used to avoid that * a firewall-generated packet loops forever through the firewall. 
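 */

/*
 * The diff above switches encap_fillarg()/encap_getarg() from mbuf aux
 * data to m_tag storage: a tag header with its payload living directly
 * behind it, hence the *(void **)(tag + 1) accesses. A user-space toy
 * version of that layout (struct tag, struct pkt, tag_get are
 * illustrative, not the kernel API):
 */
#include <stdio.h>
#include <stdlib.h>

struct tag { struct tag *next; int type; };
struct pkt { struct tag *tags; };

static struct tag *
tag_get(int type, size_t len)	/* header plus len bytes of payload */
{
	struct tag *t = malloc(sizeof(*t) + len);

	if (t != NULL) {
		t->next = NULL;
		t->type = type;
	}
	return t;
}

int
main(void)
{
	struct pkt m = { NULL };
	struct tag *t;
	int arg = 42;

	/* attach: payload sits right after the header, hence (t + 1) */
	if ((t = tag_get(1, sizeof(void *))) == NULL)
		return 1;
	*(void **)(t + 1) = &arg;
	t->next = m.tags;	/* prepend, like m_tag_prepend() */
	m.tags = t;

	/* find and detach, as encap_getarg() does */
	for (t = m.tags; t != NULL; t = t->next)
		if (t->type == 1)
			break;
	if (t != NULL) {
		printf("arg = %d\n", **(int **)(t + 1));
		m.tags = t->next;
		free(t);
	}
	return 0;
}

/*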
*/ #ifndef M_SKIP_FIREWALL #define M_SKIP_FIREWALL 0x4000 #endif /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set * are disabled. Set 31 is reserved for the default rule * and CANNOT be disabled. */ static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout_handle ipfw_timeout_h; #define IPFW_DEFAULT_RULE 65535 /* * list of rules for layer 3 */ static struct ip_fw *layer3_chain; MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); static int fw_debug = 1; int fw_one_pass = 1; static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_SECURE, &fw_enable, 0, "Enable ipfw"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &autoinc_step, 0, "Rule number autincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE, &fw_verbose, 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is stored in dyn_count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ static ipfw_dyn_rule **ipfw_dyn_v = NULL; static u_int32_t dyn_buckets = 256; /* must be power of 2 */ static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ /* * Timeouts for various events in handing dynamic rules. */ static u_int32_t dyn_ack_lifetime = 300; static u_int32_t dyn_syn_lifetime = 20; static u_int32_t dyn_fin_lifetime = 1; static u_int32_t dyn_rst_lifetime = 1; static u_int32_t dyn_udp_lifetime = 10; static u_int32_t dyn_short_lifetime = 5; /* * Keepalives are sent if dyn_keepalive is set. 
 * They are sent every dyn_keepalive_period seconds, in the last
 * dyn_keepalive_interval seconds of lifetime of a rule.
 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
 * than dyn_keepalive_period.
 */
static u_int32_t dyn_keepalive_interval = 20;
static u_int32_t dyn_keepalive_period = 5;
static u_int32_t dyn_keepalive = 1;	/* do send keepalives */

static u_int32_t static_count;	/* # of static rules */
static u_int32_t static_len;	/* size in bytes of static rules */
static u_int32_t dyn_count;	/* # of dynamic rules */
static u_int32_t dyn_max = 4096;	/* max # of dynamic rules */

SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
    &dyn_buckets, 0, "Number of dyn. buckets");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
    &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
    &dyn_count, 0, "Number of dyn. rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
    &dyn_max, 0, "Max number of dyn. rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
    &static_count, 0, "Number of static rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
#endif /* SYSCTL_NODE */

static ip_fw_chk_t	ipfw_chk;

ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL;	/* hook into dummynet */

/*
 * This macro maps an ip pointer into a layer3 header pointer of type T
 */
#define	L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))

static __inline int
icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int type = L3HDR(struct icmp,ip)->icmp_type;

	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
}

#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )

static int
is_icmp_query(struct ip *ip)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;

	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
}
#undef TT

/*
 * The following checks use two arrays of 8 or 16 bits to store the
 * bits that we want set or clear, respectively. They are in the
 * low and high half of cmd->arg1 or cmd->d[0].
 *
 * We scan options and store the bits we find set. We succeed if
 *
 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
 *
 * The code is sometimes optimized not to store additional variables.
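 */

/*
 * A worked, stand-alone demo of the want_set/want_clear encoding just
 * described, equivalent to flags_match() below. The match() name and
 * the SYN/FIN-like bit values are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

static int
match(uint16_t arg1, uint8_t bits)
{
	uint8_t want_set = arg1 & 0xff;
	uint8_t want_clear = (arg1 >> 8) & 0xff;

	return (want_set & ~bits) == 0 &&
	    (want_clear & ~bits) == want_clear;
}

int
main(void)
{
	/* want 0x02 (SYN-like) set and 0x01 (FIN-like) clear */
	uint16_t arg1 = 0x02 | (0x01 << 8);

	printf("%d %d %d\n",
	    match(arg1, 0x02),		/* 1: SYN set, FIN clear */
	    match(arg1, 0x03),		/* 0: FIN also set */
	    match(arg1, 0x00));		/* 0: SYN missing */
	return 0;
}

/*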
*/ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; struct tcphdr *tcp = L3HDR(struct tcphdr,ip); u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; case TCPOPT_CC: case TCPOPT_CCNEW: case TCPOPT_CCECHO: bits |= IP_FW_TCPOPT_CC; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ /* Check unit number (-1 is wildcard) */ if (cmd->p.unit != -1 && cmd->p.unit != ifp->if_unit) return(0); /* Check name */ if (!strncmp(ifp->if_name, cmd->name, IFNAMSIZ)) return(1); } else { struct ifaddr *ia; TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return(1); /* match */ } } return(0); /* no match, fail ... */ } static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! 
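 */

/*
 * A stand-alone version of the option-walking loop ipopts_match() and
 * tcpopts_match() above share: EOL ends the scan, NOP advances by one,
 * anything else carries its own length byte which must be validated.
 * The option codes and the fabricated option area are illustrative.
 */
#include <stdio.h>

#define OPT_EOL	0
#define OPT_NOP	1

int
main(void)
{
	/* NOP, then option 5 with length 4, then EOL padding */
	unsigned char opts[] = { OPT_NOP, 5, 4, 0, 0, OPT_EOL, 0, 0 };
	unsigned char *cp = opts;
	int x = sizeof(opts), optlen;

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[0];

		if (opt == OPT_EOL)
			break;
		if (opt == OPT_NOP)
			optlen = 1;
		else {
			optlen = cp[1];
			if (optlen <= 0 || optlen > x)
				break;	/* invalid or truncated */
		}
		printf("opt %d len %d\n", opt, optlen);
	}
	return 0;
}

/*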
*/ static void ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh, struct mbuf *m, struct ifnet *oif) { char *action; int limit_reached = 0; char action2[40], proto[48], fragment[28]; fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (verbose_limit != 0 && norule_counter >= verbose_limit) return; norule_counter++; if (norule_counter == verbose_limit) limit_reached = verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(sa->sa.sin_addr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", ntohs(sa->sa.sin_port)); } break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { struct ip *ip = mtod(m, struct ip *); /* these three are all aliases to the same thing */ struct icmp *const icmp = L3HDR(struct icmp, ip); struct tcphdr *const tcp = (struct tcphdr *)icmp; struct udphdr *const udp = (struct udphdr *)icmp; int ip_off, offset, ip_len; int len; if (eh != NULL) { /* layer 2 packets are as on the wire */ ip_off = ntohs(ip->ip_off); ip_len = ntohs(ip->ip_len); } else { ip_off = ip->ip_off; ip_len = ip->ip_len; } offset = ip_off & IP_OFFMASK; switch (ip->ip_p) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", inet_ntoa(ip->ip_src)); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), inet_ntoa(ip->ip_dst), ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", inet_ntoa(ip->ip_src)); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), inet_ntoa(ip->ip_dst), ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; case IPPROTO_ICMP: if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", inet_ntoa(ip->ip_src)); snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p, inet_ntoa(ip->ip_src)); snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); break; } if (ip_off & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), offset << 3, (ip_off & IP_MF) 
? "+" : ""); } if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%d%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_name : m->m_pkthdr.rcvif->if_name, oif ? oif->if_unit : m->m_pkthdr.rcvif->if_unit, fragment); else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (curr_dyn_buckets - 1); return i; } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(printf("-- unlink entry 0x%08x %d -> 0x%08x %d, %d left\n", \ (q->id.src_ip), (q->id.src_port), \ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ dyn_count--; \ free(old_q, M_IPFW); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. */ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; if (ipfw_dyn_v == NULL || dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_second) return; last_remove = time_second; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < curr_dyn_buckets ; i++) { for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf( "OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_second )) goto next; } UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } /** * lookup a dynamic rule. */ static ipfw_dyn_rule * lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions. 
* Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; if (ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT) goto next; if (TIME_LEQ( q->expire, time_second)) { /* expire entry */ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } if ( pkt->proto == q->id.proto) { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ q->expire = time_second + dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } q->expire = time_second + dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (dyn_fin_lifetime >= dyn_keepalive_period) dyn_fin_lifetime = dyn_keepalive_period - 1; q->expire = time_second + dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (dyn_rst_lifetime >= dyn_keepalive_period) dyn_rst_lifetime = dyn_keepalive_period - 1; q->expire = time_second + dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_second + dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_second + dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } static void realloc_dynamic_table(void) { /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. */ if (dyn_buckets > 65536) dyn_buckets = 1024; if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ dyn_buckets = curr_dyn_buckets; /* reset */ return; } curr_dyn_buckets = dyn_buckets; if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); for (;;) { ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_DONTWAIT | M_ZERO); if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) break; curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). 
When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; if (ipfw_dyn_v == NULL || (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { realloc_dynamic_table(); if (ipfw_dyn_v == NULL) return NULL; /* failed ! */ } i = hash_packet(id); r = malloc(sizeof *r, M_IPFW, M_DONTWAIT | M_ZERO); if (r == NULL) { printf ("sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_second + dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = r; dyn_count++; DEB(printf("-- add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), dyn_count ); ) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. */ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; if (ipfw_dyn_v) { i = hash_packet( pkt ); for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port) { q->expire = time_second + dyn_short_lifetime; DEB(printf("lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ static int install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args) { static int last_log; ipfw_dyn_rule *q; DEB(printf("-- install state type %d 0x%08x %u -> 0x%08x %u\n", cmd->o.opcode, (args->f_id.src_ip), (args->f_id.src_port), (args->f_id.dst_ip), (args->f_id.dst_port) );) q = lookup_dyn_rule(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ if (last_log != time_second) { last_log = time_second; printf(" install_state: entry already present, done\n"); } return 0; } if (dyn_count >= dyn_max) /* * Run out of slots, try to remove any expired rule. 
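* (the (ipfw_dyn_rule *)1 below is just a non-NULL keep_me sentinel: it makes remove_dyn_rule() reclaim only expired entries, and it can never compare equal to a real rule)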
*/ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (dyn_count >= dyn_max) { if (last_log != time_second) { last_log = time_second; printf("install_state: Too many dynamic rules\n"); } return 1; /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: /* limit number of sessions */ { u_int16_t limit_mask = cmd->limit_mask; struct ipfw_flow_id id; ipfw_dyn_rule *parent; DEB(printf("installing dyn-limit rule %d\n", cmd->conn_limit);) id.dst_ip = id.src_ip = 0; id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; parent = lookup_dyn_parent(&id, rule); if (parent == NULL) { printf("add parent failed\n"); return 1; } if (parent->count >= cmd->conn_limit) { /* * See if we can remove some expired rule. */ remove_dyn_rule(rule, parent); if (parent->count >= cmd->conn_limit) { if (fw_verbose && last_log != time_second) { last_log = time_second; printf( "drop session, too many entries\n"); } return 1; } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); } break; default: printf("unknown dynamic rule type %u\n", cmd->o.opcode); return 1; } lookup_dyn_rule(&args->f_id, NULL, NULL); /* XXX just set lifetime */ return 0; } /* * Transmit a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_SYN selects * the direction (forward if set, reverse if clear; see below). */ static void send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; struct tcphdr *tcp; struct route sro; /* fake route */ MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == 0) return; m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); m->m_data += max_linkhdr; ip = mtod(m, struct ip *); bzero(ip, m->m_len); tcp = (struct tcphdr *)(ip + 1); /* no IP options */ ip->ip_p = IPPROTO_TCP; tcp->th_off = 5; /* * Assume we are sending a RST (or a keepalive in the reverse * direction), swap src and destination addresses and ports. */ ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); tcp->th_sport = htons(id->dst_port); tcp->th_dport = htons(id->src_port); if (flags & TH_RST) { /* we are sending a RST */ if (flags & TH_ACK) { tcp->th_seq = htonl(ack); tcp->th_ack = htonl(0); tcp->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; tcp->th_seq = htonl(0); tcp->th_ack = htonl(seq); tcp->th_flags = TH_RST | TH_ACK; } } else { /* * We are sending a keepalive. flags & TH_SYN determines * the direction, forward if set, reverse if clear. * NOTE: seq and ack are always assumed to be correct * as set by the caller. This may be confusing... */ if (flags & TH_SYN) { /* * we have to rewrite the correct addresses! */ ip->ip_dst.s_addr = htonl(id->dst_ip); ip->ip_src.s_addr = htonl(id->src_ip); tcp->th_dport = htons(id->dst_port); tcp->th_sport = htons(id->src_port); } tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; } /* * set ip_len to the payload size so we can compute * the tcp checksum on the pseudoheader * XXX check this, could save a couple of words ?
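* (the IP header was bzero'ed above, so in_cksum() over the whole mbuf sums exactly the pseudo-header fields -- src, dst, proto and TCP length -- plus the TCP header itself)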
*/ ip->ip_len = htons(sizeof(struct tcphdr)); tcp->th_sum = in_cksum(m, m->m_pkthdr.len); /* * now fill fields left out earlier */ ip->ip_ttl = ip_defttl; ip->ip_len = m->m_pkthdr.len; bzero (&sro, sizeof (sro)); ip_rtaddr(ip->ip_dst, &sro); m->m_flags |= M_SKIP_FIREWALL; - ip_output(m, NULL, &sro, 0, NULL); + ip_output(m, NULL, &sro, 0, NULL, NULL); if (sro.ro_rt) RTFREE(sro.ro_rt); } /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int offset, int ip_len) { if (code != ICMP_REJECT_RST) /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) send_pkt(&(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); m_freem(args->m); } else m_freem(args->m); args->m = NULL; } /** * * Given an ip_fw *, lookup_next_rule will return a pointer * to the next rule, which can be either the jump * target (for skipto instructions) or the next one in the list (in * all other cases including a missing jump target). * The result is also written in the "next_rule" field of the rule. * Backward jumps are not allowed, so start looking from the next * rule... * * This never returns NULL -- in case we do not have an exact match, * the next rule is returned. When the ruleset is changed, * pointers are flushed so we are always correct. */ static struct ip_fw * lookup_next_rule(struct ip_fw *me) { struct ip_fw *rule = NULL; ipfw_insn *cmd; /* look for action, in case it is a skipto */ cmd = ACTION_PTR(me); if ( cmd->opcode == O_SKIPTO ) for (rule = me->next; rule ; rule = rule->next) if (rule->rulenum >= cmd->arg1) break; if (rule == NULL) /* failure or not a skipto */ rule = me->next; me->next_rule = rule; return rule; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, or NULL for layer3 packet. * args->oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * * Return value: * * IP_FW_PORT_DENY_FLAG the packet must be dropped. * 0 The packet is to be accepted and routed normally OR * the packet was denied/rejected and has been dropped; * in the latter case, *m is equal to NULL upon return. * port Divert the packet to port, with these caveats: * * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead * of diverting it (ie, 'ipfw tee'). * * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower * 16 bits as a dummynet pipe number instead of diverting */ static int ipfw_chk(struct ip_fw_args *args) { /* * Local variables hold state during the processing of a packet. * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. 
* * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is simply an alias of the value of m, and it is kept * in sync with it (the packet is supposed to start with * the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, bdg_forward, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; /* * hlen The length of the IPv4 header. * hlen >0 means we have an IPv4 packet. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ u_int16_t ip_len=0; int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; if (m->m_flags & M_SKIP_FIREWALL) return 0; /* accept */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ if (args->eh == NULL || /* layer 3 packet */ ( m->m_pkthdr.len >= sizeof(struct ip) && ntohs(args->eh->ether_type) == ETHERTYPE_IP)) hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster matching. */ if (hlen == 0) { /* do not grab addresses for non-ip pkts */ proto = args->f_id.proto = 0; /* mark f_id invalid */ goto after_ip_checks; } proto = args->f_id.proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; if (args->eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { offset = ip->ip_off & IP_OFFMASK; ip_len = ip->ip_len; } #define PULLUP_TO(len) \ do { \ if ((m)->m_len < (len)) { \ args->m = m = m_pullup(m, (len)); \ if (m == 0) \ goto pullup_failed; \ ip = mtod(m, struct ip *); \ } \ } while (0) if (offset == 0) { switch (proto) { case IPPROTO_TCP: { struct tcphdr *tcp; PULLUP_TO(hlen + sizeof(struct tcphdr)); tcp = L3HDR(struct tcphdr, ip); dst_port = tcp->th_dport; src_port = tcp->th_sport; args->f_id.flags = tcp->th_flags; } break; case IPPROTO_UDP: { struct udphdr *udp; PULLUP_TO(hlen + sizeof(struct udphdr)); udp = L3HDR(struct udphdr, ip); dst_port = udp->uh_dport; src_port = udp->uh_sport; } break; case IPPROTO_ICMP: PULLUP_TO(hlen + 4); /* type, code and checksum. 
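* (4 bytes cover exactly these fields; only icmp_type is read below)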
*/ args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type; break; default: break; } #undef PULLUP_TO } args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); after_ip_checks: if (args->rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. * * If fw_one_pass != 0 then just accept it. * XXX should not happen here, but optimized out in * the caller. */ if (fw_one_pass) return 0; f = args->rule->next_rule; if (f == NULL) f = lookup_next_rule(args->rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ int skipto = args->divert_rule; f = layer3_chain; if (args->eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) return(IP_FW_PORT_DENY_FLAG); /* invalid */ while (f && f->rulenum <= skipto) f = f->next; if (f == NULL) /* drop packet */ return(IP_FW_PORT_DENY_FLAG); } } args->divert_rule = 0; /* reset to avoid confusion later */ /* * Now scan the rules, and parse microinstructions for each rule. */ for (; f; f = f->next) { int l, cmdlen; ipfw_insn *cmd; int skip_or; /* skip rest of OR block */ again: if (set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ check_body: cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: /* * We only check offset == 0 && proto != 0, * as this ensures that we have an IPv4 * packet with the ports info. */ if (offset!=0) break; { struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; if (proto == IPPROTO_TCP) { wildcard = 0; pi = &tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = 1; pi = &udbinfo; } else break; pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb == NULL || pcb->inp_socket == NULL) break; #if __FreeBSD_version < 500034 #define socheckuid(a,b) ((a)->so_cred->cr_uid == (b)) #endif if (cmd->opcode == O_UID) { match = socheckuid(pcb->inp_socket, (uid_t)((ipfw_insn_u32 *)cmd)->d[0]); } else { match = groupmember( (uid_t)((ipfw_insn_u32 *)cmd)->d[0], pcb->inp_socket->so_cred); } } break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? 
oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t t = ntohs(args->eh->ether_type); u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (t>=p[0] && t<=p[1]); } break; case O_FRAG: match = (hlen > 0 && offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = (hlen > 0 && ((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_MASK: match = (hlen > 0 && ((ipfw_insn_ip *)cmd)->addr.s_addr == (src_ip.s_addr & ((ipfw_insn_ip *)cmd)->mask.s_addr)); break; case O_IP_SRC_ME: if (hlen > 0) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); } break; case O_IP_DST_SET: case O_IP_SRC_SET: if (hlen > 0) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.src_ip : args->f_id.dst_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = (hlen > 0 && ((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_MASK: match = (hlen > 0) && (((ipfw_insn_ip *)cmd)->addr.s_addr == (dst_ip.s_addr & ((ipfw_insn_ip *)cmd)->mask.s_addr)); break; case O_IP_DST_ME: if (hlen > 0) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); } break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have an IPv4 * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? 
src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ip, (ipfw_insn_u32 *)cmd) ); break; case O_IPOPT: match = (hlen > 0 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (hlen > 0 && cmd->arg1 == ip->ip_v); break; case O_IPTTL: match = (hlen > 0 && cmd->arg1 == ip->ip_ttl); break; case O_IPID: match = (hlen > 0 && cmd->arg1 == ntohs(ip->ip_id)); break; case O_IPLEN: match = (hlen > 0 && cmd->arg1 == ip_len); break; case O_IPPRECEDENCE: match = (hlen > 0 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (hlen > 0 && flags_match(cmd, ip->ip_tos)); break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, L3HDR(struct tcphdr,ip)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(ip, cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == L3HDR(struct tcphdr,ip)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == L3HDR(struct tcphdr,ip)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == L3HDR(struct tcphdr,ip)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (L3HDR(struct tcphdr,ip)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_LOG: if (fw_verbose) ipfw_log(f, hlen, args->eh, m, oif); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to do a 'goto done'). * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * ('goto next_rule', equivalent to a 'break 2'), * or to the SKIPTO target ('goto again' after * having set f, cmd and l), respectively. * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet * must be dropped * ('goto done' after setting retval); * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * ('goto check_body') if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found ('goto next_rule'). * The result of the lookup is cached so that * further instances of these opcodes are * effectively NOPs. */ case O_LIMIT: case O_KEEP_STATE: if (install_state(f, (ipfw_insn_limit *)cmd, args)) { retval = IP_FW_PORT_DENY_FLAG; goto done; /* error/limit violation */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir.
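* (MATCH_FORWARD or MATCH_REVERSE on a hit, MATCH_NONE on a miss)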
* The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? L3HDR(struct tcphdr, ip) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule. */ q->pcnt++; q->bcnt += ip_len; f = q->rule; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; goto check_body; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) goto next_rule; match = 1; break; case O_ACCEPT: retval = 0; /* accept */ goto done; case O_PIPE: case O_QUEUE: args->rule = f; /* report matching rule */ retval = cmd->arg1 | IP_FW_PORT_DYNT_FLAG; goto done; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; args->divert_rule = f->rulenum; retval = (cmd->opcode == O_DIVERT) ? cmd->arg1 : cmd->arg1 | IP_FW_PORT_TEE_FLAG; goto done; case O_COUNT: case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += ip_len; f->timestamp = time_second; if (cmd->opcode == O_COUNT) goto next_rule; /* handle skipto */ if (f->next_rule == NULL) lookup_next_rule(f); f = f->next_rule; goto again; case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && (proto != IPPROTO_ICMP || is_icmp_query(ip)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(dst_ip.s_addr)) { send_reject(args, cmd->arg1, offset,ip_len); m = args->m; } /* FALLTHROUGH */ case O_DENY: retval = IP_FW_PORT_DENY_FLAG; goto done; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) args->next_hop = &((ipfw_insn_sa *)cmd)->sa; retval = 0; goto done; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner for, scan opcodes */ next_rule:; /* try next rule */ } /* end of outer for, scan rules */ printf("+++ ipfw: ouch!, skip past end of rules, denying packet\n"); return(IP_FW_PORT_DENY_FLAG); done: /* Update statistics */ f->pcnt++; f->bcnt += ip_len; f->timestamp = time_second; return retval; pullup_failed: if (fw_verbose) printf("pullup failed\n"); return(IP_FW_PORT_DENY_FLAG); } /* * When a rule is added/deleted, clear the next_rule pointers in all rules. * These will be reconstructed on the fly as packets are matched. * Must be called at splimp(). */ static void flush_rule_ptrs(void) { struct ip_fw *rule; for (rule = layer3_chain; rule; rule = rule->next) rule->next_rule = NULL; } /* * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given * pipe/queue, or to all of them (match == NULL). * Must be called at splimp(). */ void flush_pipe_ptrs(struct dn_flow_set *match) { struct ip_fw *rule; for (rule = layer3_chain; rule; rule = rule->next) { ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule); if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE) continue; if (match == NULL || cmd->pipe_ptr == match) cmd->pipe_ptr = NULL; } } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. 
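* (a rulenum of 0 requests auto-numbering: the rule gets the highest number in use before the default rule, plus autoinc_step when there is room below IPFW_DEFAULT_RULE)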
* Update the rule_number in the input struct so the caller knows it as well. */ static int add_rule(struct ip_fw **head, struct ip_fw *input_rule) { struct ip_fw *rule, *f, *prev; int s; int l = RULESIZE(input_rule); if (*head == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) return (EINVAL); rule = malloc(l, M_IPFW, M_DONTWAIT | M_ZERO); if (rule == NULL) return (ENOSPC); bcopy(input_rule, rule, l); rule->next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; s = splimp(); if (*head == NULL) { /* default rule */ *head = rule; goto done; } /* * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ if (autoinc_step < 1) autoinc_step = 1; else if (autoinc_step > 1000) autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default */ for (f = *head; f; f = f->next) { if (f->rulenum == IPFW_DEFAULT_RULE) break; rule->rulenum = f->rulenum; } if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) rule->rulenum += autoinc_step; input_rule->rulenum = rule->rulenum; } /* * Now insert the new rule in the right place in the sorted list. */ for (prev = NULL, f = *head; f; prev = f, f = f->next) { if (f->rulenum > rule->rulenum) { /* found the location */ if (prev) { rule->next = f; prev->next = rule; } else { /* head insert */ rule->next = *head; *head = rule; } break; } } flush_rule_ptrs(); done: static_count++; static_len += l; splx(s); DEB(printf("++ installed rule %d, static count now %d\n", rule->rulenum, static_count);) return (0); } /** * Free storage associated with a static rule (including derived * dynamic rules). * The caller is in charge of clearing rule pointers to avoid * dangling pointers. * @return a pointer to the next entry. * Arguments are not checked, so they better be correct. * Must be called at splimp(). */ static struct ip_fw * delete_rule(struct ip_fw **head, struct ip_fw *prev, struct ip_fw *rule) { struct ip_fw *n; int l = RULESIZE(rule); n = rule->next; remove_dyn_rule(rule, NULL /* force removal */); if (prev == NULL) *head = n; else prev->next = n; static_count--; static_len -= l; if (DUMMYNET_LOADED) ip_dn_ruledel_ptr(rule); free(rule, M_IPFW); return n; } /* * Deletes all rules from a chain (including the default rule * if the second argument is set). * Must be called at splimp(). */ static void free_chain(struct ip_fw **chain, int kill_default) { struct ip_fw *rule; flush_rule_ptrs(); /* more efficient to do outside the loop */ while ( (rule = *chain) != NULL && (kill_default || rule->rulenum != IPFW_DEFAULT_RULE) ) delete_rule(chain, NULL, rule); } /** * Remove all rules with given number, and also do set manipulation. * * The argument is an u_int32_t. 
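* (packed as (cmd << 24) | (new_set << 16) | number, i.e.:)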
The low 16 bits are the rule or set number, * the next 8 bits are the new set, the top 8 bits are the command: * * 0 delete rules with given number * 1 delete rules with given set number * 2 move rules with given number to new set * 3 move rules with given set number to new set * 4 swap sets with given numbers */ static int del_entry(struct ip_fw **chain, u_int32_t arg) { struct ip_fw *prev, *rule; int s; u_int16_t rulenum; u_int8_t cmd, new_set; rulenum = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 4) return EINVAL; if (new_set > 30) return EINVAL; if (cmd == 0 || cmd == 2) { if (rulenum == IPFW_DEFAULT_RULE) return EINVAL; } else { if (rulenum > 30) return EINVAL; } switch (cmd) { case 0: /* delete rules with given number */ /* * locate first rule to delete */ for (prev = NULL, rule = *chain; rule && rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) return EINVAL; s = splimp(); /* no access to rules while removing */ /* * flush pointers outside the loop, then delete all matching * rules. prev remains the same throughout the cycle. */ flush_rule_ptrs(); while (rule && rule->rulenum == rulenum) rule = delete_rule(chain, prev, rule); splx(s); break; case 1: /* delete all rules with given set number */ s = splimp(); flush_rule_ptrs(); for (prev = NULL, rule = *chain; rule ; ) if (rule->set == rulenum) rule = delete_rule(chain, prev, rule); else { prev = rule; rule = rule->next; } splx(s); break; case 2: /* move rules with given number to new set */ s = splimp(); for (rule = *chain; rule ; rule = rule->next) if (rule->rulenum == rulenum) rule->set = new_set; splx(s); break; case 3: /* move rules with given set number to new set */ s = splimp(); for (rule = *chain; rule ; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; splx(s); break; case 4: /* swap two sets */ s = splimp(); for (rule = *chain; rule ; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; else if (rule->set == new_set) rule->set = rulenum; splx(s); break; } return 0; } /* * Clear counters for a specific rule. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * @arg rulenum is 0 to clear all entries, or contains a specific * rule number. * @arg log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(int rulenum, int log_only) { struct ip_fw *rule; int s; char *msg; if (rulenum == 0) { s = splimp(); norule_counter = 0; for (rule = layer3_chain; rule; rule = rule->next) clear_counters(rule, log_only); splx(s); msg = log_only ? "ipfw: All logging counts reset.\n" : "ipfw: Accounting cleared.\n"; } else { int cleared = 0; /* * We can have multiple rules with the same number, so we * need to clear them all. */ for (rule = layer3_chain; rule; rule = rule->next) if (rule->rulenum == rulenum) { s = splimp(); while (rule && rule->rulenum == rulenum) { clear_counters(rule, log_only); rule = rule->next; } splx(s); cleared = 1; break; } if (!cleared) /* we did not find any matching rules */ return (EINVAL); msg = log_only ? "ipfw: Entry %d logging count reset.\n" : "ipfw: Entry %d cleared.\n"; } if (fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } /* * Check validity of the structure before insert.
* Fortunately rules are simple, so this mostly needs to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } DEB(printf("ipfw: opcode %d\n", cmd->opcode);) switch (cmd->opcode) { case O_NOP: case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_IPOPT: case O_IPLEN: case O_IPID: case O_IPTOS: case O_IPPRECEDENCE: case O_IPTTL: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_UID: case O_GID: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (cmdlen != F_INSN_SIZE(ipfw_insn_ip)) goto bad_size; if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) { printf("ipfw: opcode %d, useless rule\n", cmd->opcode); return EINVAL; } break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe)) goto bad_size; goto check_action; case O_FORWARD_IP: if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: case O_SKIPTO: case O_DIVERT: case O_TEE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /** * {set|get}sockopt parser.
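* (supported options: IP_FW_GET, IP_FW_FLUSH, IP_FW_ADD, IP_FW_DEL, IP_FW_ZERO and IP_FW_RESETLOG; see the switch below)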
*/ static int ipfw_ctl(struct sockopt *sopt) { int error, s, rulenum; size_t size; struct ip_fw *bp , *buf, *rule; static u_int32_t rule_buf[255]; /* we copy the data here */ /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { #if __FreeBSD_version >= 500034 error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); #else /* FreeBSD 4.x */ if (securelevel >= 3) return (EPERM); #endif } error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. */ s = splimp(); size = static_len; /* size of static rules */ if (ipfw_dyn_v) /* add size of dyn.rules */ size += (dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know * how much room is needed, do not bother filling up the * buffer, just jump to the sooptcopyout. */ buf = malloc(size, M_TEMP, M_WAITOK); if (buf == 0) { splx(s); error = ENOBUFS; break; } bp = buf; for (rule = layer3_chain; rule ; rule = rule->next) { int i = RULESIZE(rule); bcopy(rule, bp, i); /* * abuse 'next_rule' to store the set_disable word */ (u_int32_t)(((struct ip_fw *)bp)->next_rule) = set_disable; bp = (struct ip_fw *)((char *)bp + i); } if (ipfw_dyn_v) { int i; ipfw_dyn_rule *p, *dst, *last = NULL; dst = (ipfw_dyn_rule *)bp; for (i = 0 ; i < curr_dyn_buckets ; i++ ) for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next, dst++ ) { bcopy(p, dst, sizeof *p); (int)dst->rule = p->rule->rulenum ; /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ dst->next = dst ; last = dst ; dst->expire = TIME_LEQ(dst->expire, time_second) ? 0 : dst->expire - time_second ; } if (last != NULL) /* mark last dynamic rule */ last->next = NULL; } splx(s); error = sooptcopyout(sopt, buf, size); free(buf, M_TEMP); break; case IP_FW_FLUSH: /* * Normally we cannot release the lock on each iteration. * We could do it here only because we start from the head all * the times so there is no risk of missing some entries. * On the other hand, the risk is that we end up with * a very inconsistent ruleset, so better keep the lock * around the whole cycle. * * XXX this code can be improved by resetting the head of * the list to point to the default rule, and then freeing * the old list without the need for a lock. */ s = splimp(); free_chain(&layer3_chain, 0 /* keep default rule */); splx(s); break; case IP_FW_ADD: rule = (struct ip_fw *)rule_buf; /* XXX do a malloc */ error = sooptcopyin(sopt, rule, sizeof(rule_buf), sizeof(struct ip_fw) ); size = sopt->sopt_valsize; if (error || (error = check_ipfw_struct(rule, size))) break; error = add_rule(&layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. 
* first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. */ error = sooptcopyin(sopt, rule_buf, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ error = del_entry(&layer3_chain, rule_buf[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ set_disable = (set_disable | rule_buf[0]) & ~rule_buf[1] & ~(1<<31); /* set 31 always enabled */ else error = EINVAL; break; case IP_FW_ZERO: case IP_FW_RESETLOG: /* argument is an int, the rule number */ rulenum=0; if (sopt->sopt_val != 0) { error = sooptcopyin(sopt, &rulenum, sizeof(int), sizeof(int)); if (error) break; } error = zero_entry(rulenum, sopt->sopt_name == IP_FW_RESETLOG); break; default: printf("ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); } /** * dummynet needs a reference to the default rule, because rules can be * deleted while packets hold a reference to them. When this happens, * dummynet changes the reference to the default rule (it could well be a * NULL pointer, but this way we do not need to check for the special * case, plus here we have info on the default behaviour). */ struct ip_fw *ip_fw_default_rule; /* * This procedure is only used to handle keepalives. It is invoked * every dyn_keepalive_period seconds. */ static void ipfw_tick(void * __unused unused) { int i; int s; ipfw_dyn_rule *q; if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) goto done; s = splimp(); for (i = 0 ; i < curr_dyn_buckets ; i++) { for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; if (TIME_LEQ( time_second+dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_second)) continue; /* too late, rule expired */ send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0); } } splx(s); done: ipfw_timeout_h = timeout(ipfw_tick, NULL, dyn_keepalive_period*hz); } static void ipfw_init(void) { struct ip_fw default_rule; ip_fw_chk_ptr = ipfw_chk; ip_fw_ctl_ptr = ipfw_ctl; layer3_chain = NULL; bzero(&default_rule, sizeof default_rule); default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = 31; default_rule.cmd[0].len = 1; default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT 1 ? O_ACCEPT : #endif O_DENY; add_rule(&layer3_chain, &default_rule); ip_fw_default_rule = layer3_chain; printf("ipfw2 initialized, divert %s, " "rule-based forwarding enabled, default to %s, logging ", #ifdef IPDIVERT "enabled", #else "disabled", #endif default_rule.cmd[0].opcode == O_ACCEPT ?
"accept" : "deny"); #ifdef IPFIREWALL_VERBOSE fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif if (fw_verbose == 0) printf("disabled\n"); else if (verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", verbose_limit); bzero(&ipfw_timeout_h, sizeof(struct callout_handle)); ipfw_timeout_h = timeout(ipfw_tick, NULL, hz); } static int ipfw_modevent(module_t mod, int type, void *unused) { int s; int err = 0; switch (type) { case MOD_LOAD: s = splimp(); if (IPFW_LOADED) { splx(s); printf("IP firewall already loaded\n"); err = EEXIST; } else { ipfw_init(); splx(s); } break; case MOD_UNLOAD: #if !defined(KLD_MODULE) printf("ipfw statically compiled, cannot unload\n"); err = EBUSY; #else s = splimp(); untimeout(ipfw_tick, NULL, ipfw_timeout_h); ip_fw_chk_ptr = NULL; ip_fw_ctl_ptr = NULL; free_chain(&layer3_chain, 1 /* kill default rule */); splx(s); printf("IP firewall unloaded\n"); #endif break; default: break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(ipfw, 1); #endif /* IPFW2 */ Index: head/sys/netinet/ip_icmp.c =================================================================== --- head/sys/netinet/ip_icmp.c (revision 105193) +++ head/sys/netinet/ip_icmp.c (revision 105194) @@ -1,876 +1,876 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif #include /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ static struct icmpstat icmpstat; SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, &icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, &icmpmaskrepl, 0, ""); static int drop_redirect = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, &drop_redirect, 0, ""); static int log_redirect = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, &log_redirect, 0, ""); static int icmplim = 200; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, &icmplim, 0, ""); static int icmplim_output = 1; SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, &icmplim_output, 0, ""); /* * ICMP broadcast echo sysctl */ static int icmpbmcastecho = 0; SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, &icmpbmcastecho, 0, ""); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *, struct route *); static int ip_next_mtu(int, int); extern struct protosw inetsw[]; /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(n, type, code, dest, destifp) struct mbuf *n; int type, code; n_long dest; struct ifnet *destifp; { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) icmpstat.icps_error++; /* * Don't send error if not the first fragment of message. * Don't error if the old packet protocol was ICMP * error message, only known informational types. */ if (oip->ip_off &~ (IP_MF|IP_DF)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiplen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { icmpstat.icps_oldicmp++; goto freeit; } /* Don't send error in response to a multicast or broadcast packet */ if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; /* * First, formulate icmp message */ m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) goto freeit; #ifdef MAC mac_create_mbuf_netlayer(n, m); #endif icmplen = min(oiplen + 8, oip->ip_len); if (icmplen < sizeof(struct ip)) panic("icmp_error: bad length"); m->m_len = icmplen + ICMP_MINLEN; MH_ALIGN(m, m->m_len); icp = mtod(m, struct icmp *); if ((u_int)type > ICMP_MAXTYPE) panic("icmp_error"); icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * zeroed icmp_void field. 
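* (icmp_pptr and icmp_nextmtu overlay icmp_void in the icmp_hun union of struct icmp, so zeroing icmp_void clears them all)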
*/ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && destifp) { icp->icmp_nextmtu = htons(destifp->if_mtu); } } icp->icmp_code = code; m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Convert fields to network representation. */ nip->ip_len = htons(nip->ip_len); nip->ip_off = htons(nip->ip_off); /* * Now, copy old ip header (without options) * in front of icmp message. */ if (m->m_data - sizeof(struct ip) < m->m_pktdat) panic("icmp len"); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = m->m_len; nip->ip_vhl = IP_VHL_BORING; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; icmp_reflect(m); freeit: m_freem(n); } static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; /* * Process a received ICMP message. */ void icmp_input(m, off) register struct mbuf *m; int off; { int hlen = off; register struct icmp *icp; register struct ip *ip = mtod(m, struct ip *); int icmplen = ip->ip_len; register int i; struct in_ifaddr *ia; void (*ctlfunc)(int, struct sockaddr *, void *); int code; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_src)); printf("icmp_input from %s to %s, len %d\n", buf, inet_ntoa(ip->ip_dst), icmplen); } #endif if (icmplen < ICMP_MINLEN) { icmpstat.icps_tooshort++; goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == 0) { icmpstat.icps_tooshort++; return; } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { icmpstat.icps_checksum++; goto freeit; } m->m_len += hlen; m->m_data -= hlen; if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { /* * Deliver very specific ICMP type only. */ switch (icp->icmp_type) { case ICMP_UNREACH: case ICMP_TIMXCEED: break; default: goto freeit; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; icmpstat.icps_inhist[icp->icmp_type]++; code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. 
* Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; goto deliver; case ICMP_SOURCEQUENCH: if (code) goto badcode; code = PRC_QUENCH; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { icmpstat.icps_badlen++; goto freeit; } icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; #if 1 /* * MTU discovery: * If we got a needfrag and there is a host route to the * original destination, and the MTU is not locked, then * set the MTU in the route to the suggested new value * (if given) and then notify as usual. The ULPs will * notice that the MTU has changed and adapt accordingly. * If no new MTU was suggested, then we guess a new one * less than the current value. If the new MTU is * unreasonably small (arbitrarily set at 296), then * we reset the MTU to the interface value and enable the * lock bit, indicating that we are no longer doing MTU * discovery. */ if (code == PRC_MSGSIZE) { struct rtentry *rt; int mtu; rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, RTF_CLONING | RTF_PRCLONING); if (rt && (rt->rt_flags & RTF_HOST) && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { mtu = ntohs(icp->icmp_nextmtu); if (!mtu) mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, 1); #ifdef DEBUG_MTUDISC printf("MTU for %s reduced to %d\n", inet_ntoa(icmpsrc.sin_addr), mtu); #endif if (mtu < 296) { /* rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; */ rt->rt_rmx.rmx_locks |= RTV_MTU; } else if (rt->rt_rmx.rmx_mtu > mtu) { rt->rt_rmx.rmx_mtu = mtu; } } if (rt) RTFREE(rt); } #endif /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: icmpstat.icps_badcode++; break; case ICMP_ECHO: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat.icps_bmcastecho++; break; } icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; else goto reflect; case ICMP_TSTAMP: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { icmpstat.icps_bmcasttstamp++; break; } if (icmplen < ICMP_TSLEN) { icmpstat.icps_badlen++; break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; else goto reflect; case ICMP_MASKREQ: if (icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. 
*/ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == 0) break; if (ia->ia_ifp == 0) break; icp->icmp_type = ICMP_MASKREPLY; icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } reflect: ip->ip_len += hlen; /* since ip_input deducts this */ icmpstat.icps_reflect++; icmpstat.icps_outhist[icp->icmp_type]++; icmp_reflect(m); return; case ICMP_REDIRECT: if (log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } if (drop_redirect) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { icmpstat.icps_badlen++; break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); printf("redirect dst %s to %s\n", buf, inet_ntoa(icp->icmp_gwaddr)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, (struct rtentry **)0); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&icmpsrc); #endif break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: default: break; } raw: rip_input(m, off); return; freeit: m_freem(m); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(m) struct mbuf *m; { struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct in_ifaddr *ia; struct in_addr t; struct mbuf *opts = 0; int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); struct route *ro = NULL, rt; if (!in_canforward(ip->ip_src) && ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { m_freem(m); /* Bad return address */ icmpstat.icps_badaddr++; goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; ro = &rt; bzero(ro, sizeof(*ro)); /* * If the incoming packet was addressed directly to us, * use dst as the src for the reply. Otherwise (broadcast * or anonymous), use the address which corresponds * to the incoming interface. 
*/ LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) goto match; if (m->m_pkthdr.rcvif != NULL && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) goto match; } } ia = ip_rtaddr(ip->ip_dst, ro); /* We need a route to do anything useful. */ if (ia == NULL) { m_freem(m); icmpstat.icps_noroute++; goto done; } match: t = IA_SIN(ia)->sin_addr; ip->ip_src = t; ip->ip_ttl = ip_defttl; if (optlen > 0) { register u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute()) == 0 && (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } /* * Now strip out original options by copying rest of first * mbuf's data back, and adjust the IP length. */ ip->ip_len -= optlen; ip->ip_vhl = IP_VHL_BORING; m->m_len -= optlen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= optlen; optlen += sizeof(struct ip); bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), (unsigned)(m->m_len - sizeof(struct ip))); } m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts, ro); done: if (opts) (void)m_free(opts); if (ro && ro->ro_rt) RTFREE(ro->ro_rt); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(m, opts, rt) register struct mbuf *m; struct mbuf *opts; struct route *rt; { register struct ip *ip = mtod(m, struct ip *); register int hlen; register struct icmp *icp; hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); printf("icmp_send dst %s src %s\n", buf, inet_ntoa(ip->ip_src)); } #endif - (void) ip_output(m, opts, rt, 0, NULL); + (void) ip_output(m, opts, rt, 0, NULL, NULL); } n_time iptime() { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } #if 1 /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. 
*/ static int ip_next_mtu(mtu, dir) int mtu; int dir; { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, 68, 0 }; int i; for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) { if (mtu >= mtutab[i]) break; } if (dir < 0) { if (i == 0) { return 0; } else { return mtutab[i - 1]; } } else { if (mtutab[i] == 0) { return 0; } else if(mtu > mtutab[i]) { return mtutab[i]; } else { return mtutab[i + 1]; } } } #endif /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ int badport_bandlim(int which) { static int lticks[BANDLIM_MAX + 1]; static int lpackets[BANDLIM_MAX + 1]; int dticks; const char *bandlimittype[] = { "Limiting icmp unreach response", "Limiting icmp ping response", "Limiting icmp tstamp response", "Limiting closed port RST response", "Limiting open port RST response" }; /* * Return ok status if feature disabled or argument out of * ranage. */ if (icmplim <= 0 || which > BANDLIM_MAX || which < 0) return(0); dticks = ticks - lticks[which]; /* * reset stats when cumulative dt exceeds one second. */ if ((unsigned int)dticks > hz) { if (lpackets[which] > icmplim && icmplim_output) { printf("%s from %d to %d packets per second\n", bandlimittype[which], lpackets[which], icmplim ); } lticks[which] = ticks; lpackets[which] = 0; } /* * bump packet count */ if (++lpackets[which] > icmplim) { return(-1); } return(0); } Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 105193) +++ head/sys/netinet/ip_input.c (revision 105194) @@ -1,1984 +1,1984 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
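badport_bandlim() above is deliberately coarse: it is not a token bucket but a counter that resets whenever more than a second of ticks has elapsed, so a burst straddling a window boundary can briefly exceed the nominal rate. The same scheme extracted into a reusable helper (ratelim and now_ticks are illustrative names; in the kernel the clock is the global ticks/hz pair):

/*
 * Coarse per-second limiter in the style of badport_bandlim().
 * Returns 0 if the event may proceed, -1 if over the per-second
 * budget.  A sketch, not kernel code; error logging omitted.
 */
struct ratelim {
	int last_reset;		/* tick count at last window reset */
	int count;		/* events seen in current window */
};

static int
ratelim_check(struct ratelim *rl, int now_ticks, int hz, int limit)
{
	if (limit <= 0)				/* feature disabled */
		return 0;
	if ((unsigned)(now_ticks - rl->last_reset) > (unsigned)hz) {
		rl->last_reset = now_ticks;	/* >1s elapsed: new window */
		rl->count = 0;
	}
	if (++rl->count > limit)
		return -1;			/* over budget: suppress */
	return 0;
}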
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #define _IP_VHL #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_ipdivert.h" #include "opt_ipfilter.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_pfil_hooks.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif int rsvp_on = 0; int ipforwarding = 0; SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, &ipforwarding, 0, "Enable IP forwarding between interfaces"); static int ipsendredirects = 1; /* XXX */ SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, &ipsendredirects, 0, "Enable sending IP redirects"); int ip_defttl = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, &ip_defttl, 0, "Maximum TTL on IP packets"); static int ip_dosourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_RW, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); static int ip_keepfaith = 0; SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, &ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int ip_nfragpackets = 0; static int ip_maxfragpackets; /* initialized in ip_init() */ SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, &ip_maxfragpackets, 0, "Maximum number of IPv4 fragment reassembly queue entries"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. 
*/ static int ip_checkinterface = 1; SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); #ifdef DIAGNOSTIC static int ipprintfs = 0; #endif static int ipqmaxlen = IFQ_MAXLEN; extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; struct in_ifaddrhead in_ifaddrhead; /* first inet address */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_long in_ifaddrhmask; /* mask for hash table */ SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); struct ipstat ipstat; SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); /* Packet reassembly stuff */ #define IPREASS_NHASH_LOG2 6 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (IPREASS_NHASH - 1) #define IPREASS_HASH(x,y) \ (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static int nipq = 0; /* total # of reass queues */ static int maxnipq; #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH static int ipstealth = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, &ipstealth, 0, ""); #endif /* Firewall hooks */ ip_fw_chk_t *ip_fw_chk_ptr; int fw_enable = 1 ; /* Dummynet hooks */ ip_dn_io_t *ip_dn_io_ptr; /* * XXX this is ugly -- the following two global variables are * used to store packet state while it travels through the stack. * Note that the code even makes assumptions on the size and * alignment of fields inside struct ip_srcrt so e.g. adding some * fields will break the code. This needs to be fixed. * * We need to save the IP options in case a protocol wants to respond * to an incoming packet over the same route if the packet got here * using IP source routing. This allows connection establishment and * maintenance when the remote end is on a network that is not known * to us. */ static int ip_nhops = 0; static struct ip_srcrt { struct in_addr dst; /* final destination */ char nop; /* one NOP to align */ char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; } ip_srcrt; static void save_rte(u_char *, struct in_addr); static int ip_dooptions(struct mbuf *m, int, struct sockaddr_in *next_hop); static void ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop); static void ip_freef(struct ipqhead *, struct ipq *); static struct mbuf *ip_reass(struct mbuf *, struct ipqhead *, struct ipq *, u_int32_t *, u_int16_t *); static void ipintr(void); /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. 
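Fragments land in one of IPREASS_NHASH (64) TAILQ buckets chosen by IPREASS_HASH over the source address and IP ID; destination, protocol, and ID are then compared inside the bucket. Written out as a function the macro is easier to read — it packs two nibbles of the source address and XORs in the datagram ID (constants copied from the defines above; the function name is illustrative):

#include <stdint.h>

#define NHASH_LOG2 6			/* IPREASS_NHASH_LOG2 */
#define NHASH (1 << NHASH_LOG2)
#define HMASK (NHASH - 1)

/* Same mixing as IPREASS_HASH(x, y): nibble 0 and nibble 2 of the
 * source address, XORed with the IP ID, masked to 64 buckets. */
static unsigned
reass_bucket(uint32_t src, uint16_t id)
{
	unsigned h = (src & 0xF) | (((src >> 8) & 0xF) << 4);

	return (h ^ id) & HMASK;
}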
*/ void ip_init() { register struct protosw *pr; register int i; TAILQ_INIT(&in_ifaddrhead); in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask); pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) ip_protox[pr->pr_protocol] = pr - inetsw; for (i = 0; i < IPREASS_NHASH; i++) TAILQ_INIT(&ipq[i]); maxnipq = nmbclusters / 4; ip_maxfragpackets = nmbclusters / 4; #ifndef RANDOM_IP_ID ip_id = time_second & 0xffff; #endif ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); ipintrq_present = 1; register_netisr(NETISR_IP, ipintr); } /* * XXX watch out this one. It is perhaps used as a cache for * the most recently used route ? it is cleared in in_addroute() * when a new route is successfully created. */ struct route ipforward_rt; /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip; struct ipq *fp; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; int i, hlen, checkif; u_short sum; struct in_addr pkt_dst; u_int32_t divert_info = 0; /* packet divert/tee info */ struct ip_fw_args args; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m0; int rv; #endif /* PFIL_HOOKS */ args.eh = NULL; args.oif = NULL; args.rule = NULL; args.divert_rule = 0; /* divert cookie */ args.next_hop = NULL; /* Grab info from MT_TAG mbufs prepended to the chain. */ for (; m && m->m_type == MT_TAG; m = m->m_next) { - switch(m->m_tag_id) { + switch(m->_m_tag_id) { default: printf("ip_input: unrecognised MT_TAG tag %d\n", - m->m_tag_id); + m->_m_tag_id); break; case PACKET_TAG_DUMMYNET: args.rule = ((struct dn_pkt *)m)->rule; break; case PACKET_TAG_DIVERT: args.divert_rule = (intptr_t)m->m_hdr.mh_data & 0xffff; break; case PACKET_TAG_IPFORWARD: args.next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; break; } } KASSERT(m != NULL && (m->m_flags & M_PKTHDR) != 0, ("ip_input: no HDR")); if (args.rule) { /* dummynet already filtered us */ ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; goto iphack ; } ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { ipstat.ips_toosmall++; return; } ip = mtod(m, struct ip *); if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { ipstat.ips_badvers++; goto bad; } hlen = IP_VHL_HL(ip->ip_vhl) << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ ipstat.ips_badhlen++; goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == 0) { ipstat.ips_badhlen++; return; } ip = mtod(m, struct ip *); } /* 127/8 must not appear on wire - RFC1122 */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { ipstat.ips_badsum++; goto bad; } /* * Convert fields to host representation. 
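The entry checks in ip_input() above reduce to a fixed sequence over the raw header: the version nibble must be 4, the header-length nibble must describe at least 20 bytes that are actually present, and the header checksum must verify. A flat-buffer sketch of that sequence (ipv4_check_header is an illustrative name; rfc1071_cksum is the checksum sketch shown earlier in this section):

#include <stddef.h>
#include <stdint.h>

static uint16_t rfc1071_cksum(const void *, size_t);	/* sketch above */

/* Returns the header length in bytes, or -1 for a header ip_input()
 * would count as ips_badvers, ips_badhlen, or ips_badsum. */
static int
ipv4_check_header(const uint8_t *pkt, size_t caplen)
{
	unsigned hlen;

	if (caplen < 20)
		return -1;			/* truncated */
	if ((pkt[0] >> 4) != 4)
		return -1;			/* IP_VHL_V != IPVERSION */
	hlen = (unsigned)(pkt[0] & 0x0f) << 2;	/* IP_VHL_HL << 2 */
	if (hlen < 20 || hlen > caplen)
		return -1;			/* bad or missing options */
	if (rfc1071_cksum(pkt, hlen) != 0)
		return -1;			/* ips_badsum */
	return (int)hlen;
}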
*/ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { ipstat.ips_badlen++; goto bad; } ip->ip_off = ntohs(ip->ip_off); /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: ipstat.ips_tooshort++; goto bad; } if (m->m_pkthdr.len > ip->ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip->ip_len; m->m_pkthdr.len = ip->ip_len; } else m_adj(m, ip->ip_len - m->m_pkthdr.len); } #ifdef IPSEC if (ipsec_gethist(m, NULL)) goto pass; #endif /* * IpHack's section. * Right now when no processing on packet has done * and it is still fresh out of network we do our black * deals with it. * - Firewall: deny/allow/divert * - Xlate: translate packet's addr/port (NAT). * - Pipe: pass pkt through dummynet. * - Wrap: fake packet's addr/port * - Encapsulate: put it in another IP and send out. */ iphack: #ifdef PFIL_HOOKS /* * Run through list of hooks for input packets. If there are any * filters which require that additional packets in the flow are * not fast-forwarded, they must clear the M_CANFASTFWD flag. * Note that filters must _never_ set this flag, as another filter * in the list may have previously cleared it. */ m0 = m; pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, hlen, m->m_pkthdr.rcvif, 0, &m0); if (rv) return; m = m0; if (m == NULL) return; ip = mtod(m, struct ip *); } #endif /* PFIL_HOOKS */ if (fw_enable && IPFW_LOADED) { /* * If we've been forwarded from the output side, then * skip the firewall a second time */ if (args.next_hop) goto ours; args.m = m; i = ip_fw_chk_ptr(&args); m = args.m; if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ if (m) m_freem(m); return; } ip = mtod(m, struct ip *); /* just in case m changed */ if (i == 0 && args.next_hop == NULL) /* common case */ goto pass; if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { /* Send packet to the appropriate pipe */ ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args); return; } #ifdef IPDIVERT if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { /* Divert or tee packet */ divert_info = i; goto ours; } #endif if (i == 0 && args.next_hop != NULL) goto pass; /* * if we get here, the packet must be dropped */ m_freem(m); return; } pass: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ ip_nhops = 0; /* for source routed packets */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ pkt_dst = args.next_hop ? 
args.next_hop->sin_addr : ip->ip_dst; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. */ checkif = ip_checkinterface && (ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && (args.next_hop == NULL); /* * Check for exact addresses in the hash bucket. */ LIST_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) goto ours; } /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == pkt_dst.s_addr) goto ours; if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) goto ours; #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) goto ours; #endif } } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; if (ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { ipstat.ips_cantforward++; m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; ipstat.ips_forward++; } /* * See if we belong to the destination multicast group on the * arrival interface. */ IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); if (inm == NULL) { ipstat.ips_notmember++; m_freem(m); return; } goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * FAITH(Firewall Aided Internet Translator) */ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { if (ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (ipforwarding == 0) { ipstat.ips_cantforward++; m_freem(m); } else { #ifdef IPSEC /* * Enforce inbound IPsec SPD. */ if (ipsec4_in_reject(m, NULL)) { ipsecstat.in_polvio++; goto bad; } #endif /* IPSEC */ ip_forward(m, 0, args.next_hop); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. 
*/ if (ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1, args.next_hop)) return; #endif /* IPSTEALTH */ /* Count the packet in the ip address stats */ if (ia != NULL) { ia->ia_ifa.if_ipackets++; ia->ia_ifa.if_ibytes += m->m_pkthdr.len; } /* * If offset or IP_MF are set, must reassemble. * Otherwise, nothing need be done. * (We could look in the reassembly queue to see * if the packet was previously fragmented, * but it's not worth the time; just let them time out.) */ if (ip->ip_off & (IP_MF | IP_OFFMASK)) { sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, &ipq[sum], ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_fragment_match(m, fp) && #endif ip->ip_p == fp->ipq_p) goto found; fp = 0; /* check if there's a place for the new queue */ if (nipq > maxnipq) { /* * drop something from the tail of the current queue * before proceeding further */ struct ipq *q = TAILQ_LAST(&ipq[sum], ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); if (r) { ip_freef(&ipq[i], r); break; } } } else ip_freef(&ipq[sum], q); } found: /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len -= hlen; if (ip->ip_off & IP_MF) { /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { ipstat.ips_toosmall++; /* XXX */ goto bad; } m->m_flags |= M_FRAG; } else m->m_flags &= ~M_FRAG; ip->ip_off <<= 3; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf, and update * the divert info in divert_info and args.divert_rule. */ ipstat.ips_fragments++; m->m_pkthdr.header = ip; m = ip_reass(m, &ipq[sum], fp, &divert_info, &args.divert_rule); if (m == 0) return; ipstat.ips_reassembled++; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = IP_VHL_HL(ip->ip_vhl) << 2; #ifdef IPDIVERT /* Restore original checksum before diverting packet */ if (divert_info != 0) { ip->ip_len += hlen; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); ip->ip_off = ntohs(ip->ip_off); ip->ip_len = ntohs(ip->ip_len); ip->ip_len -= hlen; } #endif } else ip->ip_len -= hlen; #ifdef IPDIVERT /* * Divert or tee packet to the divert protocol if required. */ if (divert_info != 0) { struct mbuf *clone = NULL; /* Clone packet if we're doing a 'tee' */ if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0) clone = m_dup(m, M_DONTWAIT); /* Restore packet header fields to original values */ ip->ip_len += hlen; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* Deliver packet to divert input routine */ divert_packet(m, 1, divert_info & 0xffff, args.divert_rule); ipstat.ips_delivered++; /* If 'tee', continue with original packet */ if (clone == NULL) return; m = clone; ip = mtod(m, struct ip *); ip->ip_len += hlen; /* * Jump backwards to complete processing of the * packet. But first clear divert_info to avoid * entering this block again. * We do not need to clear args.divert_rule * or args.next_hop as they will not be used. */ divert_info = 0; goto pass; } #endif #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. 
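Before a fragment is queued above, its length is sanity-checked: any fragment with IP_MF set must carry a non-zero payload that is a multiple of 8 bytes, because the offset field counts 8-byte units — hence the ip_off <<= 3 that follows. The check in isolation (constants mirror IP_MF and IP_OFFMASK from netinet/ip.h; names are suffixed to keep this sketch standalone):

#include <stdint.h>

#define IP_MF_FLAG   0x2000	/* more fragments (IP_MF) */
#define IP_OFFMASK_  0x1FFF	/* fragment offset, 8-byte units */

/*
 * Validate fragment fields (host byte order; `payload_len' already
 * excludes the header, as after ip_input()'s ip_len -= hlen).
 * Returns the payload's byte offset, or -1 for a bogus fragment.
 */
static int
frag_byte_offset(uint16_t ip_off, uint16_t payload_len)
{
	int more = (ip_off & IP_MF_FLAG) != 0;
	int off8 = ip_off & IP_OFFMASK_;

	if (more && (payload_len == 0 || (payload_len & 0x7) != 0))
		return -1;		/* ips_toosmall: bad MF fragment */
	return off8 << 3;		/* cf. ip_off <<= 3 */
}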
* note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 && ipsec4_in_reject(m, NULL)) { ipsecstat.in_polvio++; goto bad; } #endif /* * Switch out to protocol's input routine. */ ipstat.ips_delivered++; if (args.next_hop && ip->ip_p == IPPROTO_TCP) { /* TCP needs IPFORWARD info if available */ struct m_hdr tag; tag.mh_type = MT_TAG; tag.mh_flags = PACKET_TAG_IPFORWARD; tag.mh_data = (caddr_t)args.next_hop; tag.mh_next = m; (*inetsw[ip_protox[ip->ip_p]].pr_input)( (struct mbuf *)&tag, hlen); } else (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); return; bad: m_freem(m); } /* * IP software interrupt routine - to go away sometime soon */ static void ipintr(void) { struct mbuf *m; while (1) { IF_DEQUEUE(&ipintrq, m); if (m == 0) return; ip_input(m); } } /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If a chain for reassembly of this datagram already * exists, then it is given as fp; otherwise have to make a chain. * * When IPDIVERT enabled, keep additional state with each packet that * tells us if we need to divert or tee the packet we're building. * In particular, *divinfo includes the port and TEE flag, * *divert_rule is the number of the matching rule. */ static struct mbuf * ip_reass(struct mbuf *m, struct ipqhead *head, struct ipq *fp, u_int32_t *divinfo, u_int16_t *divert_rule) { struct ip *ip = mtod(m, struct ip *); register struct mbuf *p, *q, *nq; struct mbuf *t; int hlen = IP_VHL_HL(ip->ip_vhl) << 2; int i, next; /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == 0) { /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly; * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ if ((ip_maxfragpackets >= 0) && (ip_nfragpackets >= ip_maxfragpackets)) goto dropfrag; ip_nfragpackets++; if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) goto dropfrag; fp = mtod(t, struct ipq *); #ifdef MAC mac_init_ipq(fp); mac_create_ipq(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); nipq++; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; m->m_nextpkt = NULL; #ifdef IPDIVERT fp->ipq_div_info = 0; fp->ipq_div_cookie = 0; #endif goto inserted; } else { #ifdef MAC mac_update_ipq(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (GETIP(q)->ip_off > ip->ip_off) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the the preceding * segment, then it's checksum is invalidated. */ if (p) { i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; if (i > 0) { if (i >= ip->ip_len) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off += i; ip->ip_len -= i; } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. 
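The trim against the preceding fragment above works purely on (offset, length) pairs; the mbuf surgery (m_adj() and the checksum-flag reset) hangs off a three-way decision. A sketch of that decision alone, with illustrative names — drop the new fragment, keep it untouched, or advance it past the bytes the earlier fragment already supplied:

/* Outcome of fitting a new fragment after an existing one. */
enum frag_fit { FIT_KEEP, FIT_TRIMMED, FIT_DROP };

/*
 * `prev_off'/`prev_len' describe the fragment that begins before the
 * new one; `*off'/`*len' describe the new fragment and are adjusted
 * in place, mirroring ip_reass()'s m_adj() plus ip_off/ip_len update.
 */
static enum frag_fit
trim_against_prev(int prev_off, int prev_len, int *off, int *len)
{
	int i = prev_off + prev_len - *off;	/* bytes of overlap */

	if (i <= 0)
		return FIT_KEEP;		/* no overlap at all */
	if (i >= *len)
		return FIT_DROP;		/* fully covered: dropfrag */
	*off += i;				/* shed duplicated bytes */
	*len -= i;
	return FIT_TRIMMED;
}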
*/ for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; q = nq) { i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; if (i < GETIP(q)->ip_len) { GETIP(q)->ip_len -= i; GETIP(q)->ip_off += i; m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; m_freem(q); } inserted: #ifdef IPDIVERT /* * Transfer firewall instructions to the fragment structure. * Only trust info in the fragment at offset 0. */ if (ip->ip_off == 0) { fp->ipq_div_info = *divinfo; fp->ipq_div_cookie = *divert_rule; } *divinfo = 0; *divert_rule = 0; #endif /* * Check for complete reassembly. */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) return (0); next += GETIP(q)->ip_len; } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) return (0); /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) { ipstat.ips_toolong++; ip_freef(head, fp); return (0); } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = 0; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = 0; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_cat(m, q); } #ifdef MAC mac_create_datagram_from_ipq(fp, m); mac_destroy_ipq(fp); #endif #ifdef IPDIVERT /* * Extract firewall instructions from the fragment structure. */ *divinfo = fp->ipq_div_info; *divert_rule = fp->ipq_div_cookie; #endif /* * Create header for new ip packet by * modifying header of first packet; * dequeue and discard fragment reassembly header. * Make header visible. */ ip->ip_len = next; ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); nipq--; (void) m_free(dtom(fp)); ip_nfragpackets--; m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2); m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); return (m); dropfrag: #ifdef IPDIVERT *divinfo = 0; *divert_rule = 0; #endif ipstat.ips_fragdropped++; m_freem(m); return (0); #undef GETIP } /* * Free a fragment reassembly header and all * associated datagrams. */ static void ip_freef(fhp, fp) struct ipqhead *fhp; struct ipq *fp; { register struct mbuf *q; while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); (void) m_free(dtom(fp)); ip_nfragpackets--; nipq--; } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo() { register struct ipq *fp; int s = splnet(); int i; for (i = 0; i < IPREASS_NHASH; i++) { for(fp = TAILQ_FIRST(&ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { ipstat.ips_fragtimeout++; ip_freef(&ipq[i], fpp); } } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ for (i = 0; i < IPREASS_NHASH; i++) { if (ip_maxfragpackets >= 0) { while (ip_nfragpackets > ip_maxfragpackets && !TAILQ_EMPTY(&ipq[i])) { ipstat.ips_fragdropped++; ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); } } } ipflow_slowtimo(); splx(s); } /* * Drain off all datagram fragments. 
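Reassembly above is declared complete only when the sorted fragment list has no holes and the final fragment arrived without IP_MF. Over plain (offset, length, more-fragments) records, the same walk looks like this (struct and function names are illustrative):

#include <stddef.h>

struct frag_rec {
	int off;	/* payload byte offset */
	int len;	/* payload bytes */
	int mf;		/* IP_MF was set (M_FRAG on the mbuf) */
};

/* Returns total datagram payload length if complete, else -1. */
static int
reass_complete(const struct frag_rec *f, size_t n)
{
	int next = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (f[i].off != next)	/* hole before this fragment */
			return -1;
		next += f[i].len;
	}
	if (n == 0 || f[n - 1].mf)	/* last piece must end the datagram */
		return -1;
	return next;
}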
*/ void ip_drain() { int i; for (i = 0; i < IPREASS_NHASH; i++) { while(!TAILQ_EMPTY(&ipq[i])) { ipstat.ips_fragdropped++; ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); } } in_rtqdrain(); } /* * Do option processing on a datagram, * possibly discarding it if bad options are encountered, * or forwarding it if source-routed. * The pass argument is used when operating in the IPSTEALTH * mode to tell what options to process: * [LS]SRR (pass 0) or the others (pass 1). * The reason for as many as two passes is that when doing IPSTEALTH, * non-routing options should be processed only if the packet is for us. * Returns 1 if packet has been forwarded/freed, * 0 if the packet should be processed further. */ static int ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; n_time ntime; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. * Find interface with current destination address. * If none on this machine then drop if strictly routed, * or do nothing if loosely routed. * Record interface address and bring up next address * component. If strictly routed make sure next * address is on directly accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; ia = (struct in_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&ipaddr); if (ia == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!ip_acceptsourceroute) goto nosourcerouting; save_rte(cp, ip->ip_src); break; } #ifdef IPSTEALTH if (ipstealth) goto dropit; #endif if (!ip_dosourceroute) { if (ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* * Acting as a router, so generate ICMP */ nosourcerouting: strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_WARNING, "attempted source route from %s to %s\n", inet_ntoa(ip->ip_src), buf); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so silently drop. 
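The option loop at the top of ip_dooptions() above (and the nearly identical one in icmp_reflect()) follows a single pattern: EOL terminates the list, NOP is one byte, and every other option carries a length octet that must stay within the remaining option space. A standalone walker with the same bounds checks, which answer ICMP_PARAMPROB in the kernel (the callback is an illustrative addition):

#include <stdint.h>

#define OPT_EOL 0			/* IPOPT_EOL */
#define OPT_NOP 1			/* IPOPT_NOP */

/*
 * Walk IPv4 options, calling cb(type, bytes, len) per option.
 * Returns 0 for a well-formed list, -1 where ip_dooptions()
 * would generate an ICMP parameter problem.
 */
static int
walk_ip_options(const uint8_t *cp, int cnt,
    void (*cb)(int, const uint8_t *, int))
{
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == OPT_EOL)
			break;
		if (opt == OPT_NOP) {
			optlen = 1;
			continue;
		}
		if (cnt < 2)			/* no room for length octet */
			return -1;
		optlen = cp[1];
		if (optlen < 2 || optlen > cnt)	/* runs past option space */
			return -1;
		cb(opt, cp, optlen);
	}
	return 0;
}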
*/ #ifdef IPSTEALTH dropit: #endif ipstat.ips_cantforward++; m_freem(m); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) ia = (INA)ifa_ifwithnet((SA)&ipaddr); } else ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt); if (ia == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } ip->ip_dst = ipaddr.sin_addr; (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * locate outgoing interface; if we're the destination, * use the incoming interface (should be same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && (ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt)) == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(n_time) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == 0) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(n_time) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(n_time)); cp[IPOPT_OFFSET] += sizeof(n_time); } } if (forward && ipforwarding) { ip_forward(m, 1, next_hop); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); ipstat.ips_badoptions++; return (1); } /* * Given address of next destination (final or next hop), * return internet address info of interface to be used to get there. 
*/ struct in_ifaddr * ip_rtaddr(dst, rt) struct in_addr dst; struct route *rt; { register struct sockaddr_in *sin; sin = (struct sockaddr_in *)&rt->ro_dst; if (rt->ro_rt == 0 || !(rt->ro_rt->rt_flags & RTF_UP) || dst.s_addr != sin->sin_addr.s_addr) { if (rt->ro_rt) { RTFREE(rt->ro_rt); rt->ro_rt = 0; } sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; rtalloc_ign(rt, RTF_PRCLONING); } if (rt->ro_rt == 0) return ((struct in_ifaddr *)0); return (ifatoia(rt->ro_rt->rt_ifa)); } /* * Save incoming source route for use in replies, * to be picked up later by ip_srcroute if the receiver is interested. */ static void save_rte(option, dst) u_char *option; struct in_addr dst; { unsigned olen; olen = option[IPOPT_OLEN]; #ifdef DIAGNOSTIC if (ipprintfs) printf("save_rte: olen %d\n", olen); #endif if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) return; bcopy(option, ip_srcrt.srcopt, olen); ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); ip_srcrt.dst = dst; } /* * Retrieve incoming source route for use in replies, * in the same form used by setsockopt. * The first hop is placed before the options, will be removed later. */ struct mbuf * ip_srcroute() { register struct in_addr *p, *q; register struct mbuf *m; if (ip_nhops == 0) return ((struct mbuf *)0); m = m_get(M_DONTWAIT, MT_HEADER); if (m == 0) return ((struct mbuf *)0); #define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; #ifdef DIAGNOSTIC if (ipprintfs) printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); #endif /* * First save first hop for return route */ p = &ip_srcrt.route[ip_nhops - 1]; *(mtod(m, struct in_addr *)) = *p--; #ifdef DIAGNOSTIC if (ipprintfs) printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr)); #endif /* * Copy option fields and padding (nop) to mbuf. */ ip_srcrt.nop = IPOPT_NOP; ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &ip_srcrt.nop, OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, * reversing the path (pointers are now aligned). */ while (p >= ip_srcrt.route) { #ifdef DIAGNOSTIC if (ipprintfs) printf(" %lx", (u_long)ntohl(q->s_addr)); #endif *q++ = *p--; } /* * Last hop goes to final destination. */ *q = ip_srcrt.dst; #ifdef DIAGNOSTIC if (ipprintfs) printf(" %lx\n", (u_long)ntohl(q->s_addr)); #endif return (m); } /* * Strip out IP options, at higher * level protocol in the kernel. * Second argument is buffer to which options * will be moved, and return value is their length. * XXX should be deleted; last arg currently ignored. */ void ip_stripoptions(m, mopt) register struct mbuf *m; struct mbuf *mopt; { register int i; struct ip *ip = mtod(m, struct ip *); register caddr_t opts; int olen; olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); opts = (caddr_t)(ip + 1); i = m->m_len - (sizeof (struct ip) + olen); bcopy(opts + olen, opts, (unsigned)i); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. 
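ip_srcroute() above turns the recorded incoming route into the option a reply needs by walking the saved addresses backwards and finishing with the saved destination; the first recorded hop becomes the reply's last. The pointer dance reduces to an array reversal — a sketch with illustrative names:

#include <stddef.h>
#include <stdint.h>

/*
 * Build a reply route from a recorded source route: the hops in
 * arrival order plus the original destination, reversed for the
 * return path.  out[] receives nhops+1 addresses; out[0] is the
 * reply's first hop, out[nhops] the final destination.
 */
static void
reverse_srcroute(const uint32_t *hops, size_t nhops, uint32_t dst,
    uint32_t *out)
{
	size_t i;

	for (i = 0; i < nhops; i++)	/* last recorded hop goes first */
		out[i] = hops[nhops - 1 - i];
	out[nhops] = dst;		/* cf. ip_srcrt.dst */
}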
If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ static void ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) { struct ip *ip = mtod(m, struct ip *); struct rtentry *rt; int error, type = 0, code = 0; struct mbuf *mcopy; n_long dest; struct in_addr pkt_dst; struct ifnet *destifp; #ifdef IPSEC struct ifnet dummyifp; #endif dest = 0; /* * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst; #ifdef DIAGNOSTIC if (ipprintfs) printf("forward: src %lx dst %lx ttl %x\n", (u_long)ip->ip_src.s_addr, (u_long)pkt_dst.s_addr, ip->ip_ttl); #endif if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(pkt_dst) == 0) { ipstat.ips_cantforward++; m_freem(m); return; } #ifdef IPSTEALTH if (!ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); return; } #ifdef IPSTEALTH } #endif if (ip_rtaddr(pkt_dst, &ipforward_rt) == 0) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); return; } else rt = ipforward_rt.ro_rt; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ MGET(mcopy, M_DONTWAIT, m->m_type); if (mcopy != NULL) { M_COPY_PKTHDR(mcopy, m); mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8, (int)ip->ip_len); m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); #ifdef MAC /* * XXXMAC: This will eventually become an explicit * labeling point. */ mac_create_mbuf_from_mbuf(m, mcopy); #endif } #ifdef IPSTEALTH if (!ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. 
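The copy made above for a possible ICMP error keeps only the offending IP header plus the first 8 payload bytes — the minimum RFC 792 requires, and enough to recover the transport's port numbers (and the start of a TCP sequence number) at the sender. The length computation on its own, as a sketch:

/* Bytes of the offending datagram to quote in an ICMP error:
 * full IP header plus 8 data bytes, clipped to the datagram size
 * (cf. the imin() setting mcopy->m_len above). */
static int
icmp_quote_len(int ip_hlen, int ip_total_len)
{
	int want = ip_hlen + 8;

	return want < ip_total_len ? want : ip_total_len;
}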
*/ if (rt->rt_ifp == m->m_pkthdr.rcvif && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0 && ipsendredirects && !srcrt && !next_hop) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest = pkt_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; #ifdef DIAGNOSTIC if (ipprintfs) printf("redirect (%d) to %lx\n", code, (u_long)dest); #endif } } { struct m_hdr tag; if (next_hop) { /* Pass IPFORWARD info if available */ tag.mh_type = MT_TAG; tag.mh_flags = PACKET_TAG_IPFORWARD; tag.mh_data = (caddr_t)next_hop; tag.mh_next = m; m = (struct mbuf *)&tag; } error = ip_output(m, (struct mbuf *)0, &ipforward_rt, - IP_FORWARDING, 0); + IP_FORWARDING, 0, NULL); } if (error) ipstat.ips_cantforward++; else { ipstat.ips_forward++; if (type) ipstat.ips_redirectsent++; else { if (mcopy) { ipflow_create(&ipforward_rt, mcopy); m_freem(mcopy); } return; } } if (mcopy == NULL) return; destifp = NULL; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; -#ifndef IPSEC - if (ipforward_rt.ro_rt) - destifp = ipforward_rt.ro_rt->rt_ifp; -#else +#ifdef IPSEC /* * If the packet is routed over IPsec tunnel, tell the * originator the tunnel MTU. * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * XXX quickhack!!! */ if (ipforward_rt.ro_rt) { struct secpolicy *sp = NULL; int ipsecerror; int ipsechdr; struct route *ro; sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp == NULL) destifp = ipforward_rt.ro_rt->rt_ifp; else { /* count IPsec header size */ ipsechdr = ipsec4_hdrsiz(mcopy, IPSEC_DIR_OUTBOUND, NULL); /* * find the correct route for outer IPv4 * header, compute tunnel MTU. * * XXX BUG ALERT * The "dummyifp" code relies upon the fact * that icmp_error() touches only ifp->if_mtu. */ /*XXX*/ destifp = NULL; if (sp->req != NULL && sp->req->sav != NULL && sp->req->sav->sah != NULL) { ro = &sp->req->sav->sah->sa_route; if (ro->ro_rt && ro->ro_rt->rt_ifp) { dummyifp.if_mtu = ro->ro_rt->rt_ifp->if_mtu; dummyifp.if_mtu -= ipsechdr; destifp = &dummyifp; } } key_freesp(sp); } } +#else + if (ipforward_rt.ro_rt) + destifp = ipforward_rt.ro_rt->rt_ifp; #endif /*IPSEC*/ ipstat.ips_cantfrag++; break; case ENOBUFS: type = ICMP_SOURCEQUENCH; code = 0; break; case EACCES: /* ipfw denied packet */ m_freem(mcopy); return; } icmp_error(mcopy, type, code, dest, destifp); } void ip_savecontrol(inp, mp, ip, m) register struct inpcb *inp; register struct mbuf **mp; register struct ip *ip; register struct mbuf *m; { if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. 
*/ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t) opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t) ip_srcroute(), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if (((ifp = m->m_pkthdr.rcvif)) && ( ifp->if_index && (ifp->if_index <= if_index))) { sdp = (struct sockaddr_dl *) (ifaddr_byindex(ifp->if_index)->ifa_addr); /* * Change our mind and don't try copy. */ if ((sdp->sdl_family != AF_LINK) || (sdp->sdl_len > sizeof(sdlbuf))) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } /* * XXX these routines are called from the upper part of the kernel. * They need to be locked when we remove Giant. * * They could also be moved to ip_mroute.c, since all the RSVP * handling is done there already. */ static int ip_rsvp_on; struct socket *ip_rsvpd; int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (ip_rsvpd != NULL) return EADDRINUSE; ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!ip_rsvp_on) { ip_rsvp_on = 1; rsvp_on++; } return 0; } int ip_rsvp_done(void) { ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (ip_rsvp_on) { ip_rsvp_on = 0; rsvp_on--; } return 0; } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 105193) +++ head/sys/netinet/ip_mroute.c (revision 105194) @@ -1,2257 +1,2257 @@ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * * MROUTING Revision: 3.5 * $FreeBSD$ */ #include "opt_mrouting.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef MROUTING extern u_long _ip_mcast_src(int vifi); extern int _ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); extern int _ip_mrouter_done(void); extern int _ip_mrouter_get(struct socket *so, struct sockopt *sopt); extern int _ip_mrouter_set(struct socket *so, struct sockopt *sopt); extern int _mrt_ioctl(int req, caddr_t data); /* * Dummy routines and globals used when multicast routing is not compiled in. 
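The control mbufs ip_savecontrol() builds above surface in userland as ancillary data on recvmsg(). A sketch of the consumer side for IP_RECVDSTADDR on a UDP socket — a standard BSD sockets idiom rather than anything specific to this commit; error handling is omitted for brevity:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

/* Receive one datagram and return the destination address the
 * kernel recorded for it (0.0.0.0 if none was delivered). */
static struct in_addr
recv_with_dstaddr(int s, char *buf, size_t buflen)
{
	struct in_addr dst = { 0 };
	union {				/* aligned cmsg buffer */
		char buf[CMSG_SPACE(sizeof(struct in_addr))];
		struct cmsghdr align;
	} cbuf;
	struct iovec iov = { buf, buflen };
	struct msghdr msg;
	struct cmsghdr *cm;
	int on = 1;

	setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR, &on, sizeof(on));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf.buf;
	msg.msg_controllen = sizeof(cbuf.buf);
	recvmsg(s, &msg, 0);
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == IPPROTO_IP &&
		    cm->cmsg_type == IP_RECVDSTADDR)
			memcpy(&dst, CMSG_DATA(cm), sizeof(dst));
	return dst;
}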
*/ struct socket *ip_mrouter = NULL; u_int rsvpdebug = 0; int _ip_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { return(EOPNOTSUPP); } int (*ip_mrouter_set)(struct socket *, struct sockopt *) = _ip_mrouter_set; int _ip_mrouter_get(so, sopt) struct socket *so; struct sockopt *sopt; { return(EOPNOTSUPP); } int (*ip_mrouter_get)(struct socket *, struct sockopt *) = _ip_mrouter_get; int _ip_mrouter_done() { return(0); } int (*ip_mrouter_done)(void) = _ip_mrouter_done; int _ip_mforward(ip, ifp, m, imo) struct ip *ip; struct ifnet *ifp; struct mbuf *m; struct ip_moptions *imo; { return(0); } int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *) = _ip_mforward; int _mrt_ioctl(int req, caddr_t data) { return EOPNOTSUPP; } int (*mrt_ioctl)(int, caddr_t) = _mrt_ioctl; void rsvp_input(m, off) /* XXX must fixup manually */ struct mbuf *m; int off; { /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); return; } /* Drop the packet */ m_freem(m); } int (*legal_vif_num)(int) = 0; /* * This should never be called, since IP_MULTICAST_VIF should fail, but * just in case it does get called, the code a little lower in ip_output * will assign the packet a local address. */ u_long _ip_mcast_src(int vifi) { return INADDR_ANY; } u_long (*ip_mcast_src)(int) = _ip_mcast_src; int ip_rsvp_vif_init(so, sopt) struct socket *so; struct sockopt *sopt; { return(EINVAL); } int ip_rsvp_vif_done(so, sopt) struct socket *so; struct sockopt *sopt; { return(EINVAL); } void ip_rsvp_force_done(so) struct socket *so; { return; } #else /* MROUTING */ #define M_HASCL(m) ((m)->m_flags & M_EXT) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); #ifndef MROUTE_KLD /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter = NULL; #endif #if defined(MROUTING) || defined(MROUTE_KLD) static struct mrtstat mrtstat; SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); #endif static struct mfc *mfctable[MFCTBLSIZ]; static u_char nexpire[MFCTBLSIZ]; static struct vif viftable[MAXVIFS]; static u_int mrtdebug = 0; /* debug level */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 static u_int tbfdebug = 0; /* tbf debug level */ static u_int rsvpdebug = 0; /* rsvp debug level */ static struct callout_handle expire_upcalls_ch; #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Define the token bucket filter structures * tbftable -> each vif has one of these for storing info */ static struct tbf tbftable[MAXVIFS]; #define TBF_REPROCESS (hz / 100) /* 100x / second */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. 
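 *
 * (Concretely: add_vif() points a tunnel vif's v_ifp at one of these
 * fake ifnets, and mroute_encap_input() stamps that same fake ifnet
 * into m_pkthdr.rcvif on decapsulated packets, so the parent-vif
 * check in ip_mdq() -- viftable[vifi].v_ifp != ifp -- works for
 * tunnels exactly as it does for physical interfaces.)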
*/ static struct ifnet multicast_decap_if[MAXVIFS]; #define ENCAP_TTL 64 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */ /* prototype IP hdr for encapsulated packets */ static struct ip multicast_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ ENCAP_TTL, ENCAP_PROTO, 0, /* checksum */ }; /* * Private variables. */ static vifi_t numvifs = 0; static const struct encaptab *encap_cookie = NULL; /* * one-back cache used by mroute_encapcheck to locate a tunnel's vif * given a datagram's src ip address. */ static u_long last_encap_src; static struct vif *last_encap_vif; static u_long X_ip_mcast_src(int vifi); static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); static int X_mrt_ioctl(int cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static int ip_mrouter_init(struct socket *, int); static int add_vif(struct vifctl *); static int del_vif(vifi_t); static int add_mfc(struct mfcctl *); static int del_mfc(struct mfcctl *); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static int set_assert(int); static void expire_upcalls(void *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static void encap_send(struct ip *, struct vif *, struct mbuf *); static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); static void tbf_queue(struct vif *, struct mbuf *); static void tbf_process_q(struct vif *); static void tbf_reprocess_q(void *); static int tbf_dq_sel(struct vif *, struct ip *); static void tbf_send_packet(struct vif *, struct mbuf *); static void tbf_update_tokens(struct vif *); static int priority(struct vif *, struct ip *); /* * whether or not special PIM assert processing is enabled. */ static int pim_assert; /* * Rate limit for assert notification messages, in usec */ #define ASSERT_MSG_TIME 3000000 /* * Hash function for a source, group entry */ #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ ((g) >> 20) ^ ((g) >> 10) ^ (g)) /* * Find a route for a given origin IP address and Multicast group address * Type of service parameter to be added in the future!!! 
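 *
 * Typical use (sketch, mirroring X_ip_mforward() below):
 *
 *	struct mfc *rt;
 *	s = splnet();
 *	MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt);
 *	if (rt != NULL)
 *		... entry exists, forward via ip_mdq() ...
 *	else
 *		... queue the packet, send an IGMPMSG_NOCACHE upcall ...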
*/ #define MFCFIND(o, g, rt) { \ register struct mfc *_rt = mfctable[MFCHASH(o,g)]; \ rt = NULL; \ ++mrtstat.mrts_mfc_lookups; \ while (_rt) { \ if ((_rt->mfc_origin.s_addr == o) && \ (_rt->mfc_mcastgrp.s_addr == g) && \ (_rt->mfc_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mfc_next; \ } \ if (rt == NULL) { \ ++mrtstat.mrts_mfc_misses; \ } \ } /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) { \ register int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING u_long upcall_data[51]; static void collate(struct timeval *); #endif /* UPCALL_TIMING */ /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ static int X_ip_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl mfc; if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) return (EPERM); error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc); if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; default: error = EOPNOTSUPP; break; } return (error); } #ifndef MROUTE_KLD int (*ip_mrouter_set)(struct socket *, struct sockopt *) = X_ip_mrouter_set; #endif /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(so, sopt) struct socket *so; struct sockopt *sopt; { int error; static int version = 0x0305; /* !!! why is this here? 
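 * (0x0305 is presumably MROUTING revision 3.5, per the header above,
 * encoded as major/minor bytes for the MRT_VERSION sockopt handled
 * below; a shared header file would be a better home for it.)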
XXX */ switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &version, sizeof version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); break; default: error = EOPNOTSUPP; break; } return (error); } #ifndef MROUTE_KLD int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; #endif /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(cmd, data) int cmd; caddr_t data; { int error = 0; switch (cmd) { case (SIOCGETVIFCNT): return (get_vif_cnt((struct sioc_vif_req *)data)); break; case (SIOCGETSGCNT): return (get_sg_cnt((struct sioc_sg_req *)data)); break; default: return (EINVAL); break; } return error; } #ifndef MROUTE_KLD int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; #endif /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(req) register struct sioc_sg_req *req; { register struct mfc *rt; int s; s = splnet(); MFCFIND(req->src.s_addr, req->grp.s_addr, rt); splx(s); if (rt != NULL) { req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; } else req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(req) register struct sioc_vif_req *req; { register vifi_t vifi = req->vifi; if (vifi >= numvifs) return EINVAL; req->icount = viftable[vifi].v_pkt_in; req->ocount = viftable[vifi].v_pkt_out; req->ibytes = viftable[vifi].v_bytes_in; req->obytes = viftable[vifi].v_bytes_out; return 0; } /* * Enable multicast routing */ static int ip_mrouter_init(so, version) struct socket *so; int version; { if (mrtdebug) log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; if (ip_mrouter != NULL) return EADDRINUSE; ip_mrouter = so; bzero((caddr_t)mfctable, sizeof(mfctable)); bzero((caddr_t)nexpire, sizeof(nexpire)); pim_assert = 0; expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init\n"); return 0; } /* * Disable multicast routing */ static int X_ip_mrouter_done() { vifi_t vifi; int i; struct ifnet *ifp; struct ifreq ifr; struct mfc *rt; struct rtdetq *rte; int s; s = splnet(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_lcl_addr.s_addr != 0 && !(viftable[vifi].v_flags & VIFF_TUNNEL)) { ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; ifp = viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)tbftable, sizeof(tbftable)); bzero((caddr_t)viftable, sizeof(viftable)); numvifs = 0; pim_assert = 0; untimeout(expire_upcalls, (caddr_t)NULL, expire_upcalls_ch); /* * Free all multicast forwarding cache entries. 
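 * Each entry may still hold packets queued on its mfc_stall list for
 * an upcall that was never answered; those mbufs (and the rtdetq
 * records holding them) are freed here as well so nothing leaks.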
*/ for (i = 0; i < MFCTBLSIZ; i++) { for (rt = mfctable[i]; rt != NULL; ) { struct mfc *nr = rt->mfc_next; for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } free(rt, M_MRTABLE); rt = nr; } } bzero((caddr_t)mfctable, sizeof(mfctable)); /* * Reset de-encapsulation cache */ last_encap_src = 0; last_encap_vif = NULL; if (encap_cookie) { encap_detach(encap_cookie); encap_cookie = NULL; } ip_mrouter = NULL; splx(s); if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_done\n"); return 0; } #ifndef MROUTE_KLD int (*ip_mrouter_done)(void) = X_ip_mrouter_done; #endif /* * Set PIM assert processing global */ static int set_assert(i) int i; { if ((i != 1) && (i != 0)) return EINVAL; pim_assert = i; return 0; } /* * Decide if a packet is from a tunnelled peer. * Return 0 if not, 64 if so. */ static int mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; register struct vif *vifp; /* * don't claim the packet if it's not to a multicast destination or if * we don't have an encapsulating tunnel with the source. * Note: This code assumes that the remote site IP address * uniquely identifies the tunnel (i.e., that this site has * at most one tunnel with the remote site). */ if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) { return 0; } if (ip->ip_src.s_addr != last_encap_src) { register struct vif *vife; vifp = viftable; vife = vifp + numvifs; last_encap_src = ip->ip_src.s_addr; last_encap_vif = 0; for ( ; vifp < vife; ++vifp) if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL) last_encap_vif = vifp; break; } } if ((vifp = last_encap_vif) == 0) { last_encap_src = 0; return 0; } return 64; } /* * De-encapsulate a packet and feed it back through ip input (this * routine is called whenever IP gets a packet that mroute_encap_func() * claimed). */ static void mroute_encap_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int hlen = ip->ip_hl << 2; if (hlen > sizeof(struct ip)) ip_stripoptions(m, (struct mbuf *) 0); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m->m_pkthdr.len -= sizeof(struct ip); m->m_pkthdr.rcvif = last_encap_vif->v_ifp; (void) IF_HANDOFF(&ipintrq, m, NULL); /* * normally we would need a "schednetisr(NETISR_IP)" * here but we were called by ip_input and it is going * to loop back & try to dequeue the packet we just * queued as soon as we return so we avoid the * unnecessary software interrrupt. */ } extern struct domain inetdomain; static struct protosw mroute_encap_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, mroute_encap_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; /* * Add a vif to the vif table */ static int add_vif(vifcp) register struct vifctl *vifcp; { register struct vif *vifp = viftable + vifcp->vifc_vifi; static struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error, s; struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL; if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE; /* Find the interface with an address in AF_INET family */ sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == 0) return EADDRNOTAVAIL; ifp = ifa->ifa_ifp; if (vifcp->vifc_flags & VIFF_TUNNEL) { if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { /* * An encapsulating tunnel is wanted. 
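 * (The encap_attach_func() call below registers, once, with the
 * generic IP-in-IP encapsulation dispatcher: mroute_encapcheck()
 * above scores candidate datagrams, claiming ours with 64, and
 * winning packets are handed to mroute_encap_input().)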
Tell * mroute_encap_input() to start paying attention * to encapsulated packets. */ if (encap_cookie == NULL) { encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, mroute_encapcheck, (struct protosw *)&mroute_encap_protosw, NULL); if (encap_cookie == NULL) { printf("ip_mroute: unable to attach encap\n"); return (EIO); /* XXX */ } for (s = 0; s < MAXVIFS; ++s) { multicast_decap_if[s].if_name = "mdecap"; multicast_decap_if[s].if_unit = s; } } /* * Set interface to fake encapsulator interface */ ifp = &multicast_decap_if[vifcp->vifc_vifi]; /* * Prepare cached route entry */ bzero(&vifp->v_route, sizeof(vifp->v_route)); } else { log(LOG_ERR, "source routed tunnels not supported\n"); return EOPNOTSUPP; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return EOPNOTSUPP; /* Enable promiscuous reception of all IP multicasts from the if */ s = splnet(); error = if_allmulti(ifp, 1); splx(s); if (error) return error; } s = splnet(); /* define parameters for the tbf structure */ vifp->v_tbf = v_tbf; GET_TIME(vifp->v_tbf->tbf_last_pkt_t); vifp->v_tbf->tbf_n_tok = 0; vifp->v_tbf->tbf_q_len = 0; vifp->v_tbf->tbf_max_q_len = MAXQSIZE; vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* scaling up here allows division by 1024 in critical code */ vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; vifp->v_rsvp_on = 0; vifp->v_rsvpd = NULL; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; splx(s); /* Adjust numvifs up if the vifi is higher than numvifs */ if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; if (mrtdebug) log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", vifcp->vifc_vifi, (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), vifcp->vifc_threshold, vifcp->vifc_rate_limit); return 0; } /* * Delete a vif from the vif table */ static int del_vif(vifi) vifi_t vifi; { register struct vif *vifp = &viftable[vifi]; register struct mbuf *m; struct ifnet *ifp; struct ifreq ifr; int s; if (vifi >= numvifs) return EINVAL; if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL; s = splnet(); if (!(vifp->v_flags & VIFF_TUNNEL)) { ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; ifp = vifp->v_ifp; if_allmulti(ifp, 0); } if (vifp == last_encap_vif) { last_encap_vif = 0; last_encap_src = 0; } /* * Free packets queued at the interface */ while (vifp->v_tbf->tbf_q) { m = vifp->v_tbf->tbf_q; vifp->v_tbf->tbf_q = m->m_act; m_freem(m); } bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); bzero((caddr_t)vifp, sizeof (*vifp)); if (mrtdebug) log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); /* Adjust numvifs down */ for (vifi = numvifs; vifi > 0; vifi--) if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break; numvifs = vifi; splx(s); return 0; } /* * Add an mfc entry */ static int add_mfc(mfccp) struct mfcctl *mfccp; { struct mfc *rt; u_long hash; struct rtdetq *rte; register u_short nstl; int s; int i; MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); s = splnet(); rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; splx(s); return 0; } /* * Find the entry for which the upcall was made and update */ s = splnet(); hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", "multiple kernel entries", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", (u_long)ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, (void *)rt->mfc_stall); rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; rt->mfc_expire = 0; /* Don't clean this guy up */ nexpire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mfc_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE); rte = n; } rt->mfc_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), 
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; if (rt->mfc_expire) nexpire[hash]--; rt->mfc_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; rt->mfc_expire = 0; rt->mfc_stall = NULL; /* link into table */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; } } splx(s); return 0; } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(t) register struct timeval *t; { register u_long d; register struct timeval tp; register u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > 50) d = 50; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_mfc(mfccp) struct mfcctl *mfccp; { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; struct mfc **nptr; u_long hash; int s; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); if (mrtdebug & DEBUG_MFC) log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); s = splnet(); nptr = &mfctable[hash]; while ((rt = *nptr) != NULL) { if (origin.s_addr == rt->mfc_origin.s_addr && mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && rt->mfc_stall == NULL) break; nptr = &rt->mfc_next; } if (rt == NULL) { splx(s); return EADDRNOTAVAIL; } *nptr = rt->mfc_next; free(rt, M_MRTABLE); splx(s); return 0; } /* * Send a message to mrouted on the multicast routing socket */ static int socket_send(s, mm, src) struct socket *s; struct mbuf *mm; struct sockaddr_in *src; { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return 0; } } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. 
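 *
 * Call pattern from the output side (sketch; compare the multicast
 * branch of ip_output() in ip_output.c below):
 *
 *	if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
 *		if (ip_mforward(ip, ifp, m, imo) != 0) {
 *			m_freem(m);	-- mrouter said drop
 *			goto done;
 *		}
 *	}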
*/ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(ip, ifp, m, imo) register struct ip *ip; struct ifnet *ifp; struct mbuf *m; struct ip_moptions *imo; { register struct mfc *rt; register u_char *ipoptions; static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static int srctun = 0; register struct mbuf *mm; int s; vifi_t vifi; struct vif *vifp; if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), (void *)ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ if ((srctun++ % 1000) == 0) log(LOG_ERR, "ip_mforward: received source-routed packet from %lx\n", (u_long)ntohl(ip->ip_src.s_addr)); return 1; } if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { if (ip->ip_ttl < 255) ip->ip_ttl++; /* compensate for -1 in *_send routines */ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { vifp = viftable + vifi; printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr), vifi, (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", vifp->v_ifp->if_name, vifp->v_ifp->if_unit); } return (ip_mdq(m, ifp, NULL, vifi)); } if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr)); if(!imo) printf("In fact, no options were specified at all\n"); } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) return 0; /* * Determine forwarding vifs from the forwarding cache table */ s = splnet(); MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt); /* Entry exists, so forward if necessary */ if (rt != NULL) { splx(s); return (ip_mdq(m, ifp, rt, -1)); } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & * send message to routing daemon */ register struct mbuf *mb0; register struct rtdetq *rte; register u_long hash; int hlen = ip->ip_hl << 2; #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif mrtstat.mrts_no_route++; if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { splx(s); return ENOBUFS; } mb0 = m_copy(m, 0, M_COPYALL); if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); splx(s); return ENOBUFS; } /* is there an upcall waiting for this packet? 
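 * (i.e. a stalled cache entry, created by an earlier packet for the
 * same source/group, whose IGMPMSG_NOCACHE upcall mrouted has not
 * answered yet; if so we just append to its queue below)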
*/ hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && (rt->mfc_stall != NULL)) break; } if (rt == NULL) { int i; struct igmpmsg *im; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE); m_freem(mb0); splx(s); return ENOBUFS; } /* Make a copy of the header to send to the user level process */ mm = m_copy(mb0, 0, hlen); if (mm == NULL) { free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* * Send message to routing daemon to install * a route into the kernel table */ k_igmpsrc.sin_addr = ip->ip_src; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; mrtstat.mrts_upcalls++; if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; nexpire[hash]++; for (i = 0; i < numvifs; i++) rt->mfc_ttls[i] = 0; rt->mfc_parent = -1; /* link into table */ rt->mfc_next = mfctable[hash]; mfctable[hash] = rt; rt->mfc_stall = rte; } else { /* determine if q has overflowed */ int npkts = 0; struct rtdetq **p; for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) npkts++; if (npkts > MAX_UPQ) { mrtstat.mrts_upq_ovflw++; free(rte, M_MRTABLE); m_freem(mb0); splx(s); return 0; } /* Add this entry to the end of the queue */ *p = rte; } rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif rte->next = NULL; splx(s); return 0; } } #ifndef MROUTE_KLD int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *) = X_ip_mforward; #endif /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *unused) { struct rtdetq *rte; struct mfc *mfc, **nptr; int i; int s; s = splnet(); for (i = 0; i < MFCTBLSIZ; i++) { if (nexpire[i] == 0) continue; nptr = &mfctable[i]; for (mfc = *nptr; mfc != NULL; mfc = *nptr) { /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 && --mfc->mfc_expire == 0) { if (mrtdebug & DEBUG_EXPIRE) log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", (u_long)ntohl(mfc->mfc_origin.s_addr), (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ for (rte = mfc->mfc_stall; rte; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } ++mrtstat.mrts_cache_cleanups; nexpire[i]--; *nptr = mfc->mfc_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mfc_next; } } } splx(s); expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(m, ifp, rt, xmt_vif) register struct mbuf *m; register struct ifnet *ifp; register struct mfc *rt; register vifi_t xmt_vif; { register struct ip *ip = mtod(m, struct ip *); register vifi_t vifi; register struct vif *vifp; register int plen = ip->ip_len; /* * Macro to send packet on vif. 
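 * (Tunnel vifs wrap the datagram in a fresh IP header via
 * encap_send(); physical vifs go through phyint_send(); both feed
 * the vif's token bucket filter when a rate limit is configured.)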
Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC_SEND(ip,vifp,m) { \ if ((vifp)->v_flags & VIFF_TUNNEL) \ encap_send((ip), (vifp), (m)); \ else \ phyint_send((ip), (vifp), (m)); \ } /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < numvifs) { MC_SEND(ip, viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { /* came in the wrong interface */ if (mrtdebug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); ++mrtstat.mrts_wrong_if; ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, and we are forwarding * packets on this interface, and it is a broadcast medium * interface (and not a tunnel), send a message to the routing daemon. */ if (pim_assert && rt->mfc_ttls[vifi] && (ifp->if_flags & IFF_BROADCAST) && !(viftable[vifi].v_flags & VIFF_TUNNEL)) { struct sockaddr_in k_igmpsrc; struct mbuf *mm; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct timeval now; register u_long delta; GET_TIME(now); TV_DELTA(rt->mfc_last_assert, now, delta); if (delta > ASSERT_MSG_TIME) { mm = m_copy(m, 0, hlen); if (mm && (M_HASCL(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) { return ENOBUFS; } rt->mfc_last_assert = now; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; k_igmpsrc.sin_addr = im->im_src; socket_send(ip_mrouter, mm, &k_igmpsrc); } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { viftable[vifi].v_pkt_out++; viftable[vifi].v_bytes_out += plen; } else { viftable[vifi].v_pkt_in++; viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { vifp->v_pkt_out++; vifp->v_bytes_out += plen; MC_SEND(ip, vifp, m); } return 0; } /* * check if a vif number is legal/ok. This is used by ip_output, to export * numvifs there, */ static int X_legal_vif_num(vif) int vif; { if (vif >= 0 && vif < numvifs) return(1); else return(0); } #ifndef MROUTE_KLD int (*legal_vif_num)(int) = X_legal_vif_num; #endif /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(vifi) int vifi; { if (vifi >= 0 && vifi < numvifs) return viftable[vifi].v_lcl_addr.s_addr; else return INADDR_ANY; } #ifndef MROUTE_KLD u_long (*ip_mcast_src)(int) = X_ip_mcast_src; #endif static void phyint_send(ip, vifp, m) struct ip *ip; struct vif *vifp; struct mbuf *m; { register struct mbuf *mb_copy; register int hlen = ip->ip_hl << 2; /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. 
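 * (m_copy() shares cluster data read-only between the original and
 * the copy, so the m_pullup() that follows is what actually gives us
 * a privately-owned, writable header of at least hlen bytes.)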
*/ mb_copy = m_copy(m, 0, M_COPYALL); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); } static void encap_send(ip, vifp, m) register struct ip *ip; register struct vif *vifp; register struct mbuf *m; { register struct mbuf *mb_copy; register struct ip *ip_copy; register int i, len = ip->ip_len; /* * copy the old packet & pullup its IP header into the * new mbuf so we can modify it. Try to fill the new * mbuf since if we don't the ethernet driver will. */ MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); if (mb_copy == NULL) return; mb_copy->m_data += max_linkhdr; mb_copy->m_len = sizeof(multicast_encap_iphdr); if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { m_freem(mb_copy); return; } i = MHLEN - M_LEADINGSPACE(mb_copy); if (i > len) i = len; mb_copy = m_pullup(mb_copy, i); if (mb_copy == NULL) return; mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); /* * fill in the encapsulating IP header. */ ip_copy = mtod(mb_copy, struct ip *); *ip_copy = multicast_encap_iphdr; #ifdef RANDOM_IP_ID ip_copy->ip_id = ip_randomid(); #else ip_copy->ip_id = htons(ip_id++); #endif ip_copy->ip_len += len; ip_copy->ip_src = vifp->v_lcl_addr; ip_copy->ip_dst = vifp->v_rmt_addr; /* * turn the encapsulated IP header back into a valid one. */ ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); --ip->ip_ttl; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; mb_copy->m_data += sizeof(multicast_encap_iphdr); ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); mb_copy->m_data -= sizeof(multicast_encap_iphdr); if (vifp->v_rate_limit == 0) tbf_send_packet(vifp, mb_copy); else tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); } /* * Token bucket filter module */ static void tbf_control(vifp, m, ip, p_len) register struct vif *vifp; register struct mbuf *m; register struct ip *ip; register u_long p_len; { register struct tbf *t = vifp->v_tbf; if (p_len > MAX_BKT_SIZE) { /* drop if packet is too large */ mrtstat.mrts_pkt2large++; m_freem(m); return; } tbf_update_tokens(vifp); /* if there are enough tokens, * and the queue is empty, * send this packet out */ if (t->tbf_q_len == 0) { /* queue empty, send packet if enough tokens */ if (p_len <= t->tbf_n_tok) { t->tbf_n_tok -= p_len; tbf_send_packet(vifp, m); } else { /* queue packet and timeout till later */ tbf_queue(vifp, m); timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); } } else if (t->tbf_q_len < t->tbf_max_q_len) { /* finite queue length, so queue pkts and process queue */ tbf_queue(vifp, m); tbf_process_q(vifp); } else { /* queue length too much, try to dq and queue and process */ if (!tbf_dq_sel(vifp, ip)) { mrtstat.mrts_q_overflow++; m_freem(m); return; } else { tbf_queue(vifp, m); tbf_process_q(vifp); } } return; } /* * adds a packet to the queue at the interface */ static void tbf_queue(vifp, m) register struct vif *vifp; register struct mbuf *m; { register int s = splnet(); register struct tbf *t = vifp->v_tbf; if (t->tbf_t == NULL) { /* Queue was empty */ t->tbf_q = m; } else { /* Insert at tail */ t->tbf_t->m_act = m; } /* Set new tail pointer */ t->tbf_t = m; #ifdef DIAGNOSTIC /* Make sure we didn't get fed a bogus mbuf */ if (m->m_act) panic("tbf_queue: m_act"); #endif m->m_act = NULL; t->tbf_q_len++; splx(s); } /* * processes the queue at the interface */ static void tbf_process_q(vifp) 
register struct vif *vifp; { register struct mbuf *m; register int len; register int s = splnet(); register struct tbf *t = vifp->v_tbf; /* loop through the queue at the interface and send as many packets * as possible */ while (t->tbf_q_len > 0) { m = t->tbf_q; len = mtod(m, struct ip *)->ip_len; /* determine if the packet can be sent */ if (len <= t->tbf_n_tok) { /* if so, * reduce no of tokens, dequeue the packet, * send the packet. */ t->tbf_n_tok -= len; t->tbf_q = m->m_act; if (--t->tbf_q_len == 0) t->tbf_t = NULL; m->m_act = NULL; tbf_send_packet(vifp, m); } else break; } splx(s); } static void tbf_reprocess_q(xvifp) void *xvifp; { register struct vif *vifp = xvifp; if (ip_mrouter == NULL) return; tbf_update_tokens(vifp); tbf_process_q(vifp); if (vifp->v_tbf->tbf_q_len) timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); } /* function that will selectively discard a member of the queue * based on the precedence value and the priority */ static int tbf_dq_sel(vifp, ip) register struct vif *vifp; register struct ip *ip; { register int s = splnet(); register u_int p; register struct mbuf *m, *last; register struct mbuf **np; register struct tbf *t = vifp->v_tbf; p = priority(vifp, ip); np = &t->tbf_q; last = NULL; while ((m = *np) != NULL) { if (p > priority(vifp, mtod(m, struct ip *))) { *np = m->m_act; /* If we're removing the last packet, fix the tail pointer */ if (m == t->tbf_t) t->tbf_t = last; m_freem(m); /* it's impossible for the queue to be empty, but * we check anyway. */ if (--t->tbf_q_len == 0) t->tbf_t = NULL; splx(s); mrtstat.mrts_drop_sel++; return(1); } np = &m->m_act; last = m; } splx(s); return(0); } static void tbf_send_packet(vifp, m) register struct vif *vifp; register struct mbuf *m; { struct ip_moptions imo; int error; static struct route ro; int s = splnet(); if (vifp->v_flags & VIFF_TUNNEL) { /* If tunnel options */ ip_output(m, (struct mbuf *)0, &vifp->v_route, - IP_FORWARDING, (struct ip_moptions *)0); + IP_FORWARDING, (struct ip_moptions *)0, NULL); } else { imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, (struct mbuf *)0, &ro, - IP_FORWARDING, &imo); + IP_FORWARDING, &imo, NULL); if (mrtdebug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on vif %d err %d\n", vifp - viftable, error); } splx(s); } /* determine the current time and then * the elapsed time (between the last time and time now) * in milliseconds & update the no. of tokens in the bucket */ static void tbf_update_tokens(vifp) register struct vif *vifp; { struct timeval tp; register u_long tm; register int s = splnet(); register struct tbf *t = vifp->v_tbf; GET_TIME(tp); TV_DELTA(tp, t->tbf_last_pkt_t, tm); /* * This formula is actually * "time in seconds" * "bytes/second". * * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) * * The (1000/1024) was introduced in add_vif to optimize * this divide into a shift. 
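 *
 * Worked example: for a vif configured at 1000 kbits/sec, add_vif()
 * stores v_rate_limit = 1000 * 1024 / 1000 = 1024.  After tm =
 * 1000000 usec (one second) the update below adds
 *
 *	1000000 * 1024 / 1024 / 8 = 125000 bytes,
 *
 * i.e. exactly 1,000,000 bits -- the configured rate.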
*/ t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; t->tbf_last_pkt_t = tp; if (t->tbf_n_tok > MAX_BKT_SIZE) t->tbf_n_tok = MAX_BKT_SIZE; splx(s); } static int priority(vifp, ip) register struct vif *vifp; register struct ip *ip; { register int prio; /* temporary hack; may add general packet classifier some day */ /* * The UDP port space is divided up into four priority ranges: * [0, 16384) : unclassified - lowest priority * [16384, 32768) : audio - highest priority * [32768, 49152) : whiteboard - medium priority * [49152, 65536) : video - low priority */ if (ip->ip_p == IPPROTO_UDP) { struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); switch (ntohs(udp->uh_dport) & 0xc000) { case 0x4000: prio = 70; break; case 0x8000: prio = 60; break; case 0xc000: prio = 55; break; default: prio = 50; break; } if (tbfdebug > 1) log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio); } else { prio = 50; } return prio; } /* * End of token bucket filter modifications */ int ip_rsvp_vif_init(so, sopt) struct socket *so; struct sockopt *sopt; { int error, i, s; if (rsvpdebug) printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; /* Check mbuf. */ error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) return (error); if (rsvpdebug) printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", i, rsvp_on); s = splnet(); /* Check vif. */ if (!legal_vif_num(i)) { splx(s); return EADDRNOTAVAIL; } /* Check if socket is available. */ if (viftable[i].v_rsvpd != NULL) { splx(s); return EADDRINUSE; } viftable[i].v_rsvpd = so; /* This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!viftable[i].v_rsvp_on) { viftable[i].v_rsvp_on = 1; rsvp_on++; } splx(s); return 0; } int ip_rsvp_vif_done(so, sopt) struct socket *so; struct sockopt *sopt; { int error, i, s; if (rsvpdebug) printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) return (error); s = splnet(); /* Check vif. */ if (!legal_vif_num(i)) { splx(s); return EADDRNOTAVAIL; } if (rsvpdebug) printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n", viftable[i].v_rsvpd, so); /* * XXX as an additional consistency check, one could make sure * that viftable[i].v_rsvpd == so, otherwise passing so as * first parameter is pretty useless. */ viftable[i].v_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (viftable[i].v_rsvp_on) { viftable[i].v_rsvp_on = 0; rsvp_on--; } splx(s); return 0; } void ip_rsvp_force_done(so) struct socket *so; { int vifi; register int s; /* Don't bother if it is not the right type of socket. */ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return; s = splnet(); /* The socket may be attached to more than one vif...this * is perfectly legal. */ for (vifi = 0; vifi < numvifs; vifi++) { if (viftable[vifi].v_rsvpd == so) { viftable[vifi].v_rsvpd = NULL; /* This may seem silly, but we need to be sure we don't * over-decrement the RSVP counter, in case something slips up. 
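 * rsvp_on is global: it counts every vif (plus the old-style
 * non-vif socket) with RSVP enabled, and rsvp_input() drops all
 * RSVP packets whenever it is zero.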
*/ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; rsvp_on--; } } } splx(s); return; } void rsvp_input(m, off) struct mbuf *m; int off; { int vifi; register struct ip *ip = mtod(m, struct ip *); static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; register int s; struct ifnet *ifp; if (rsvpdebug) printf("rsvp_input: rsvp_on %d\n",rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } s = splnet(); if (rsvpdebug) printf("rsvp_input: check vifs\n"); #ifdef DIAGNOSTIC if (!(m->m_flags & M_PKTHDR)) panic("rsvp_input no hdr"); #endif ifp = m->m_pkthdr.rcvif; /* Find which vif the packet arrived on. */ for (vifi = 0; vifi < numvifs; vifi++) if (viftable[vifi].v_ifp == ifp) break; if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { /* * If the old-style non-vif-associated socket is set, * then use it. Otherwise, drop packet since there * is no specific socket for this vif. */ if (ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ } else { if (rsvpdebug && vifi == numvifs) printf("rsvp_input: Can't find vif for packet.\n"); else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) printf("rsvp_input: No socket defined for vif %d\n",vifi); m_freem(m); } splx(s); return; } rsvp_src.sin_addr = ip->ip_src; if (rsvpdebug && m) printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { if (rsvpdebug) printf("rsvp_input: Failed to append to socket\n"); } else { if (rsvpdebug) printf("rsvp_input: send packet up\n"); } splx(s); } #ifdef MROUTE_KLD static int ip_mroute_modevent(module_t mod, int type, void *unused) { int s; switch (type) { static u_long (*old_ip_mcast_src)(int); static int (*old_ip_mrouter_set)(struct socket *, struct sockopt *); static int (*old_ip_mrouter_get)(struct socket *, struct sockopt *); static int (*old_ip_mrouter_done)(void); static int (*old_ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int (*old_mrt_ioctl)(int, caddr_t); static int (*old_legal_vif_num)(int); case MOD_LOAD: s = splnet(); /* XXX Protect against multiple loading */ old_ip_mcast_src = ip_mcast_src; ip_mcast_src = X_ip_mcast_src; old_ip_mrouter_get = ip_mrouter_get; ip_mrouter_get = X_ip_mrouter_get; old_ip_mrouter_set = ip_mrouter_set; ip_mrouter_set = X_ip_mrouter_set; old_ip_mrouter_done = ip_mrouter_done; ip_mrouter_done = X_ip_mrouter_done; old_ip_mforward = ip_mforward; ip_mforward = X_ip_mforward; old_mrt_ioctl = mrt_ioctl; mrt_ioctl = X_mrt_ioctl; old_legal_vif_num = legal_vif_num; legal_vif_num = X_legal_vif_num; splx(s); return 0; case MOD_UNLOAD: if (ip_mrouter) return EINVAL; s = splnet(); ip_mrouter_get = old_ip_mrouter_get; ip_mrouter_set = old_ip_mrouter_set; ip_mrouter_done = old_ip_mrouter_done; ip_mforward = old_ip_mforward; mrt_ioctl = old_mrt_ioctl; legal_vif_num = old_legal_vif_num; splx(s); return 0; default: break; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); #endif /* MROUTE_KLD */ #endif /* MROUTING */ Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 105193) +++ 
head/sys/netinet/ip_output.c (revision 105194) @@ -1,2065 +1,2059 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #define _IP_VHL #include "opt_ipfw.h" #include "opt_ipdn.h" #include "opt_ipdivert.h" #include "opt_ipfilter.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_pfil_hooks.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); #ifdef IPSEC #include #include #ifdef IPSEC_DEBUG #include #else #define KEYDEBUG(lev,arg) #endif #endif /*IPSEC*/ #include #include #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); static int ip_getmoptions (struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf **, struct mbuf *); static int ip_setmoptions (struct sockopt *, struct ip_moptions **); int ip_optcopy(struct ip *, struct ip *); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. 
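 *
 * With this change the caller's inpcb (or NULL) is passed in
 * explicitly, replacing the old ipsec_getsocket()/ipsec_setsocket()
 * side channel that smuggled the socket through the mbuf.  Sketch of
 * the two call styles (flag values illustrative):
 *
 *	error = ip_output(m, opts, &ro, 0, imo, inp);	-- from a PCB
 *	error = ip_output(m, NULL, &ipforward_rt,
 *	    IP_FORWARDING, NULL, NULL);			-- forwarding path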
*/ int -ip_output(m0, opt, ro, flags, imo) +ip_output(m0, opt, ro, flags, imo, inp) struct mbuf *m0; struct mbuf *opt; struct route *ro; int flags; struct ip_moptions *imo; + struct inpcb *inp; { struct ip *ip, *mhip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m; int hlen = sizeof (struct ip); int len, off, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; #ifdef IPSEC struct route iproute; - struct socket *so = NULL; struct secpolicy *sp = NULL; + struct socket *so = inp ? inp->inp_socket : NULL; #endif struct ip_fw_args args; int src_was_INADDR_ANY = 0; /* as the name says... */ #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m1; int rv; #endif /* PFIL_HOOKS */ args.eh = NULL; args.rule = NULL; args.next_hop = NULL; args.divert_rule = 0; /* divert cookie */ /* Grab info from MT_TAG mbufs prepended to the chain. */ for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) { - switch(m0->m_tag_id) { + switch(m0->_m_tag_id) { default: printf("ip_output: unrecognised MT_TAG tag %d\n", - m0->m_tag_id); + m0->_m_tag_id); break; case PACKET_TAG_DUMMYNET: /* * the packet was already tagged, so part of the * processing was already done, and we need to go down. * Get parameters from the header. */ args.rule = ((struct dn_pkt *)m0)->rule; opt = NULL ; ro = & ( ((struct dn_pkt *)m0)->ro ) ; imo = NULL ; dst = ((struct dn_pkt *)m0)->dn_dst ; ifp = ((struct dn_pkt *)m0)->ifp ; flags = ((struct dn_pkt *)m0)->flags ; break; case PACKET_TAG_DIVERT: args.divert_rule = (intptr_t)m0->m_data & 0xffff; break; case PACKET_TAG_IPFORWARD: args.next_hop = (struct sockaddr_in *)m0->m_data; break; } } m = m0; KASSERT(!m || (m->m_flags & M_PKTHDR) != 0, ("ip_output: no HDR")); - KASSERT(ro != NULL, ("ip_output: no route, proto %d", - mtod(m, struct ip *)->ip_p)); - -#ifdef IPSEC - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); -#endif if (args.rule != NULL) { /* dummynet already saw us */ ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; if (ro->ro_rt) ia = ifatoia(ro->ro_rt->rt_ifa); goto sendit; } if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; /* * Fill in IP header. */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); ip->ip_off &= IP_DF; #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif ipstat.ips_localout++; } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != pkt_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if (ro->ro_rt == 0) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = pkt_dst; } /* * If routing to interface only, * short circuit routing lookup. 
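 * (IP_ROUTETOIF: the destination must be directly reachable on an
 * attached interface -- found via ifa_ifwithdstaddr() or
 * ifa_ifwithnet() -- and the TTL is forced to 1 so the packet
 * cannot travel past that first hop.)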
*/ if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the * ULP. This lets TCP do its thing, while not burdening * forwarding or ICMP with the overhead of cloning a route. * Of course, we still want to do any cloning requested by * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ if (ro->ro_rt == 0) rtalloc_ign(ro, RTF_PRCLONING); if (ro->ro_rt == 0) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src(imo->imo_multicast_vif); } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * XXX * delayed checksums are not currently * compatible with IP multicast routing */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } IN_LOOKUP_MULTI(pkt_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * Check if rsvp daemon is running. If not, don't * set ip_moptions. 
This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!rsvp_on) imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } #ifndef notdef /* * If the source address is not specified yet, use the address * of the outgoing interface. In that case, keep note we did that, so * if the firewall changes the next-hop causing the output * interface to change, we can fix that. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; src_was_INADDR_ANY = 1; } } #endif /* notdef */ /* * Verify that we have any chance at all of being able to queue * the packet or packet fragments */ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { error = ENOBUFS; ipstat.ips_odropped++; goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if ((u_short)ip->ip_len > ifp->if_mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC /* get SP for this packet */ if (so == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); else sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { ipsecstat.out_inval++; goto bad; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsecstat.out_polvio++; goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto bad; } break; case IPSEC_POLICY_ENTRUST: default: printf("ip_output: Invalid policy found. %d\n", sp->policy); } { struct ipsec_output_state state; bzero(&state, sizeof(state)); state.m = m; if (flags & IP_ROUTETOIF) { state.ro = &iproute; bzero(&iproute, sizeof(iproute)); } else state.ro = ro; state.dst = (struct sockaddr *)dst; ip->ip_sum = 0; /* * XXX * delayed checksums are not currently compatible with IPsec */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); error = ipsec4_output(&state, sp, flags); m = state.m; if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore * IP_ROUTETOIF. */ if (state.ro != &iproute || state.ro->ro_rt != NULL) { flags &= ~IP_ROUTETOIF; ro = state.ro; } } else ro = state.ro; dst = (struct sockaddr_in *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output.
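[Editorial sketch] As a reading aid, the contract of the IPsec output block above, restated compactly (all names are the ones used in the function):

	/*
	 * ipsec4_output() consumes state.m and may replace it (tunnel
	 * mode prepends an outer IP header); it may also repoint
	 * state.ro at a route to the tunnel endpoint, which is why
	 * IP_ROUTETOIF can be dropped afterwards.  On error the mbuf
	 * has already been freed, hence the error path that follows
	 * clears m0 before jumping to the shared exit, so nothing is
	 * freed twice.
	 */
	struct ipsec_output_state state;

	bzero(&state, sizeof(state));
	state.m = m;
	state.ro = ro;
	state.dst = (struct sockaddr *)dst;
	error = ipsec4_output(&state, sp, flags);
	m = state.m;		/* may differ from the mbuf we passed in */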
*/ m0 = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip4_output (ipsec): error code %d\n", error); /*fall through*/ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } } /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { printf("ip_output: " "can't update route after IPsec processing\n"); error = EHOSTUNREACH; /*XXX*/ goto bad; } } else { ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; } /* make it flipped, again. */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); skip_ipsec: #endif /*IPSEC*/ /* * IpHack's section. * - Xlate: translate packet's addr/port (NAT). * - Firewall: deny/allow/etc. * - Wrap: fake packet's addr/port * - Encapsulate: put it in another IP and send out. */ #ifdef PFIL_HOOKS /* * Run through list of hooks for output packets. */ m1 = m; pfh = pfil_hook_get(PFIL_OUT, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, hlen, ifp, 1, &m1); if (rv) { error = EHOSTUNREACH; goto done; } m = m1; if (m == NULL) goto done; ip = mtod(m, struct ip *); } #endif /* PFIL_HOOKS */ /* * Check with the firewall... * but not if we are already being fwd'd from a firewall. */ if (fw_enable && IPFW_LOADED && !args.next_hop) { struct sockaddr_in *old = dst; args.m = m; args.next_hop = dst; args.oif = ifp; off = ip_fw_chk_ptr(&args); m = args.m; dst = args.next_hop; /* * On return we must do the following: * m == NULL -> drop the pkt (old interface, deprecated) * (off & IP_FW_PORT_DENY_FLAG) -> drop the pkt (new interface) * 1<=off<= 0xffff -> DIVERT * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet * dst != old -> IPFIREWALL_FORWARD * off==0, dst==old -> accept * If some of the above modules are not compiled in, then * we should't have to check the corresponding condition * (because the ipfw control socket should not accept * unsupported rules), but better play safe and drop * packets in case of doubt. */ if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) { if (m) m_freem(m); error = EACCES; goto done; } ip = mtod(m, struct ip *); if (off == 0 && dst == old) /* common case */ goto pass; if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { /* * pass the pkt to dummynet. Need to include * pipe number, m, ifp, ro, dst because these are * not recomputed in the next pass. * All other parameters have been already used and * so they are not needed anymore. * XXX note: if the ifp or ro entry are deleted * while a pkt is in dummynet, we are in trouble! */ args.ro = ro; args.dst = dst; args.flags = flags; error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, &args); goto done; } #ifdef IPDIVERT if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { struct mbuf *clone = NULL; /* Clone packet if we're doing a 'tee' */ if ((off & IP_FW_PORT_TEE_FLAG) != 0) clone = m_dup(m, M_DONTWAIT); /* * XXX * delayed checksums are not currently compatible * with divert sockets. 
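[Editorial sketch] The return-value conventions spelled out in the comment above can be summarized in one dispatch sketch; this is a paraphrase of the block that follows the ip_fw_chk_ptr() call, not additional logic:

	off = ip_fw_chk_ptr(&args);	/* may rewrite args.m / args.next_hop */
	m = args.m;
	dst = args.next_hop;
	if (m == NULL || (off & IP_FW_PORT_DENY_FLAG)) {
		/* drop: old interface (m == NULL) or new DENY flag */
	} else if (off == 0 && dst == old) {
		/* accepted unchanged: the common case, goto pass */
	} else if (off & IP_FW_PORT_DYNT_FLAG) {
		/* dummynet pipe; (off & 0xffff) selects the pipe */
	} else if (off & IP_FW_PORT_TEE_FLAG) {
		/* tee: duplicate to a divert port, keep sending original */
	} else if (off != 0) {
		/* divert to the socket bound to port (off & 0xffff) */
	} else {
		/* dst != old: IPFIREWALL_FORWARD re-route */
	}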
*/ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* Restore packet header fields to original values */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* Deliver packet to divert input routine */ divert_packet(m, 0, off & 0xffff, args.divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { m = clone; ip = mtod(m, struct ip *); goto pass; } goto done; } #endif /* IPFIREWALL_FORWARD */ /* * Check dst to make sure it is directly reachable on the * interface we previously thought it was. * If it isn't (which may be likely in some situations) we have * to re-route it (ie, find a route for the next-hop and the * associated interface) and set them here. This is nested * forwarding which in most cases is undesirable, except where * such control is nigh impossible. So we do it here. * And I'm babbling. */ if (off == 0 && old != dst) { /* FORWARD, dst has changed */ #if 0 /* * XXX To improve readability, this block should be * changed into a function call as below: */ error = ip_ipforward(&m, &dst, &ifp); if (error) goto bad; if (m == NULL) /* ip_input consumed the mbuf */ goto done; #else struct in_ifaddr *ia; /* * XXX sro_fwd below is static, and a pointer * to it gets passed to routines downstream. * This could have surprisingly bad results in * practice, because its content is overwritten * by subsequent packets. */ /* There must be a better way to do this next line... */ static struct route sro_fwd; struct route *ro_fwd = &sro_fwd; #if 0 print_ip("IPFIREWALL_FORWARD: New dst ip: ", dst->sin_addr, "\n"); #endif /* * We need to figure out if we have been forwarded * to a local socket. If so, then we should somehow * "loop back" to ip_input, and get directed to the * PCB as if we had received this packet. This is * because it may be dificult to identify the packets * you want to forward until they are being output * and have selected an interface. (e.g. locally * initiated packets) If we used the loopback inteface, * we would not be able to control what happens * as the packet runs through ip_input() as * it is done through a ISR. */ LIST_FOREACH(ia, INADDR_HASH(dst->sin_addr.s_addr), ia_hash) { /* * If the addr to forward to is one * of ours, we pretend to * be the destination for this packet. */ if (IA_SIN(ia)->sin_addr.s_addr == dst->sin_addr.s_addr) break; } if (ia) { /* tell ip_input "dont filter" */ struct m_hdr tag; tag.mh_type = MT_TAG; tag.mh_flags = PACKET_TAG_IPFORWARD; tag.mh_data = (caddr_t)args.next_hop; tag.mh_next = m; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifunit("lo0"); if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m0->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip_input((struct mbuf *)&tag); goto done; } /* Some of the logic for this was * nicked from above. * * This rewrites the cached route in a local PCB. * Is this what we want to do? 
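[Editorial sketch] One idiom in the forward-to-local-socket path above deserves a note: when the packet is re-injected into ip_input() without ever touching a wire, a pending transmit checksum can simply be declared valid for the receive side. A distilled sketch of that marking:

	/*
	 * The data never leaves the machine, so instead of computing the
	 * TCP/UDP checksum we mark it pre-verified: CSUM_DATA_VALID plus
	 * CSUM_PSEUDO_HDR with csum_data = 0xffff tells the receive path
	 * the sum checked out.  The IP header sum is waved through the
	 * same way.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
		m->m_pkthdr.csum_data = 0xffff;
	}
	m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;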
*/ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); ro_fwd->ro_rt = 0; rtalloc_ign(ro_fwd, RTF_PRCLONING); if (ro_fwd->ro_rt == 0) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro_fwd->ro_rt->rt_ifa); ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *) ro_fwd->ro_rt->rt_gateway; if (ro_fwd->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); if (ro->ro_rt) RTFREE(ro->ro_rt); ro->ro_rt = ro_fwd->ro_rt; dst = (struct sockaddr_in *)&ro_fwd->ro_dst; #endif /* ... block to be put into a function */ /* * If we added a default src ip earlier, * which would have been gotten from the-then * interface, do it again, from the new one. */ if (src_was_INADDR_ANY) ip->ip_src = IA_SIN(ia)->sin_addr; goto pass ; } /* * if we get here, none of the above matches, and * we have to drop the pkt */ m_freem(m); error = EACCES; /* not sure this is the right error msg */ goto done; } pass: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ if ((u_short)ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { if (ip->ip_vhl == IP_VHL_BORING) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(m, hlen); } } /* Record statistics for this interface address. */ if (!(flags & IP_FORWARDING) && ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ if (ip->ip_off & IP_DF) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } ipstat.ips_cantfrag++; goto bad; } len = (ifp->if_mtu - hlen) &~ 7; if (len < 8) { error = EMSGSIZE; goto bad; } /* * if the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragement large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. 
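[Editorial sketch] The checksum-offload split in the "pass:" section above is compact enough to restate as a worked sketch: if_hwassist advertises what the NIC can compute, and whatever remains after masking must be done in software before the packet is handed down.

	m->m_pkthdr.csum_flags |= CSUM_IP;	/* we want an IP header sum */
	/* Bits the hardware cannot cover are ours to compute. */
	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
	if (sw_csum & CSUM_DELAY_DATA) {	/* no TCP/UDP offload here */
		in_delayed_cksum(m);
		sw_csum &= ~CSUM_DELAY_DATA;
	}
	/* Leave only the flags the driver has promised to honor. */
	m->m_pkthdr.csum_flags &= ifp->if_hwassist;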
*/ int newlen; struct mbuf *mtmp; for (mtmp = m, off = 0; mtmp && ((off + mtmp->m_len) <= ifp->if_mtu); mtmp = mtmp->m_next) { off += mtmp->m_len; } /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & ifp->if_mtu; if ((newlen + sizeof (struct ip)) > ifp->if_mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } /* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n", len, hlen, sizeof (struct ip), newlen, off);*/ len = newlen; } else { off = hlen + len; } { int mhlen, firstlen = off - hlen; struct mbuf **mnext = &m->m_nextpkt; int nfrags = 1; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof (struct ip); for (; off < (u_short)ip->ip_len; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == 0) { error = ENOBUFS; ipstat.ips_odropped++; goto sendorfree; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= (u_short)ip->ip_len) len = (u_short)ip->ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; /* ??? */ ipstat.ips_odropped++; goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_create_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { if (mhip->ip_vhl == IP_VHL_BORING) { mhip->ip_sum = in_cksum_hdr(mhip); } else { mhip->ip_sum = in_cksum(m, mhlen); } } *mnext = m; mnext = &m->m_nextpkt; nfrags++; } ipstat.ips_ofragments += nfrags; /* set first/last markers for fragment chain */ m->m_flags |= M_LASTFRAG; m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). */ m = m0; m_adj(m, hlen + firstlen - (u_short)ip->ip_len); m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { if (ip->ip_vhl == IP_VHL_BORING) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(m, hlen); } } sendorfree: for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif if (error == 0) { /* Record statistics for this interface address. 
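[Editorial sketch] A worked example may help with the fragment arithmetic in the loop above. Assume a 1500-byte MTU and a 20-byte header, so len = (1500 - 20) & ~7 = 1480 data bytes per non-final fragment; a 4000-byte datagram then yields three fragments whose ip_off fields (in 8-byte units) are 0, 185, and 370, the first two with IP_MF set:

	len = (ifp->if_mtu - hlen) & ~7;	/* 1480: payload, 8-byte aligned */
	/* Inside the loop, each fragment records its position: */
	mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;  /* (1500-20)/8 = 185 */
	if (off + len >= (u_short)ip->ip_len)
		len = (u_short)ip->ip_len - off;	/* final, short fragment */
	else
		mhip->ip_off |= IP_MF;			/* more fragments follow */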
*/ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ipstat.ips_fragmented++; } done: #ifdef IPSEC if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ip_output call free SP:%p\n", sp)); key_freesp(sp); } #endif /* IPSEC */ return (error); bad: m_freem(m); goto done; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = IP_VHL_HL(ip->ip_vhl) << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the existing chain instead of rearranging it. */ m = m_pullup(m, offset + sizeof(u_short)); } *(u_short *)(m->m_data + offset) = csum; } /* * Insert IP options into preformed packet. * Adjust IP destination as required for IP source routing, * as indicated by a non-zero in_addr at the start of the options. * * XXX This routine assumes that the packet has no options in place. */ static struct mbuf * ip_insertoptions(m, opt, phlen) register struct mbuf *m; struct mbuf *opt; int *phlen; { register struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; register struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n == 0) { *phlen = 0; return (m); } n->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_create_mbuf_from_mbuf(m, n); #endif n->m_pkthdr.len = m->m_pkthdr.len + optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); ip->ip_len += optlen; return (m); } /* * Copy options from ip to jp, * omitting those not copied during fragmentation. */ int ip_optcopy(ip, jp) struct ip *ip, *jp; { register u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. 
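[Editorial sketch] For readers tracing in_delayed_cksum() above: csum_data holds the offset of the checksum field within the transport header (16 for TCP, 6 for UDP), so with a 20-byte IP header a TCP checksum is stored at byte 20 + 16 = 36 of the packet. A condensed sketch of the store:

	offset = IP_VHL_HL(ip->ip_vhl) << 2;		/* 20 for a plain header */
	csum = in_cksum_skip(m, ip->ip_len, offset);	/* checksum the payload */
	if ((m->m_pkthdr.csum_flags & CSUM_UDP) && csum == 0)
		csum = 0xffff;			/* 0 means "no checksum" in UDP */
	offset += m->m_pkthdr.csum_data;		/* + 16 (TCP) or 6 (UDP) */
	*(u_short *)(m->m_data + offset) = csum;	/* patch it into place */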
*/ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* bogus lengths should have been caught by ip_dooptions */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * IP socket option processing. */ int ip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER); if (m == 0) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, m)); } case IP_TOS: case IP_TTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: case IP_FAITH: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; #define OPTSET(bit) \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; } break; #undef OPTSET case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_setmoptions(sopt, &inp->inp_moptions); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; size_t len = 0; int priv; struct mbuf *m; int optname; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; priv = (sopt->sopt_td != NULL && suser(sopt->sopt_td) != 0) ? 0 : 1; req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; error = ipsec4_set_policy(inp, optname, req, len, priv); m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_getmoptions(sopt, inp->inp_moptions); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Set up IP options in pcb for insertion in output packets. * Store in mbuf with pointer in pcbopt, adding pseudo-option * with destination address if source routed. */ static int ip_pcbopts(optname, pcbopt, m) int optname; struct mbuf **pcbopt; register struct mbuf *m; { register int cnt, optlen; register u_char *cp; u_char opt; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = 0; if (m == (struct mbuf *)0 || m->m_len == 0) { /* * Only turning off any previous options. */ if (m) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before * actual options; move other options back * and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); bzero(mtod(m, caddr_t), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * user process specifies route as: * ->A->B->C->D * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear in * actual IP option, but is stored before the options. */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), (caddr_t)&cp[IPOPT_OFFSET+1], (unsigned)cnt + sizeof(struct in_addr)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * XXX * The whole multicast option thing needs to be re-thought. 
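[Editorial sketch] To make the ->A->B->C->D comment in ip_pcbopts() above concrete, here is a hypothetical userland counterpart (the descriptor s and the in_addr variables hop_a/hop_b are assumed, and error handling uses err(3)): the application passes a raw LSRR option via IP_OPTIONS, and the kernel peels the first address off as the first-hop destination.

	u_char opts[12];			/* must be a multiple of 4 */

	opts[0] = IPOPT_LSRR;
	opts[1] = 11;				/* type + len + ptr + 2 addrs */
	opts[2] = IPOPT_MINOFF;			/* pointer starts at offset 4 */
	memcpy(&opts[3], &hop_a, 4);		/* "A": stripped by the kernel */
	memcpy(&opts[7], &hop_b, 4);		/* "B": stays in the header */
	opts[11] = IPOPT_EOL;			/* pad to a 4-byte boundary */
	if (setsockopt(s, IPPROTO_IP, IP_OPTIONS, opts, sizeof(opts)) == -1)
		err(1, "IP_OPTIONS");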
* Several of these options are equally applicable to non-multicast * transmission, and one (IP_MULTICAST_TTL) totally duplicates a * standard option (IP_TTL). */ /* * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. */ static struct ifnet * ip_multicast_if(a, ifindexp) struct in_addr *a; int *ifindexp; { int ifindex; struct ifnet *ifp; if (ifindexp) *ifindexp = 0; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; if (ifindex < 0 || if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); if (ifindexp) *ifindexp = ifindex; } else { INADDR_TO_IFP(*a, ifp); } return ifp; } /* * Set the IP multicast options in response to user setsockopt(). */ static int ip_setmoptions(sopt, imop) struct sockopt *sopt; struct ip_moptions **imop; { int error = 0; int i; struct in_addr addr; struct ip_mreq mreq; struct ifnet *ifp; struct ip_moptions *imo = *imop; struct route ro; struct sockaddr_in *dst; int ifindex; int s; if (imo == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); if (imo == NULL) return (ENOBUFS); *imop = imo; imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; } switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ case IP_MULTICAST_VIF: if (legal_vif_num == 0) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) break; if (!legal_vif_num(i) && (i != -1)) { error = EINVAL; break; } imo->imo_multicast_vif = i; break; case IP_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); if (error) break; /* * INADDR_ANY is used to remove a previous selection. * When no interface is selected, a default one is * chosen every time a multicast packet is sent. */ if (addr.s_addr == INADDR_ANY) { imo->imo_multicast_ifp = NULL; break; } /* * The selected interface is identified by its local * IP address. Find the interface and confirm that * it supports multicasting. */ s = splimp(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { splx(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_ifp = ifp; if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; splx(s); break; case IP_MULTICAST_TTL: /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char ttl; error = sooptcopyin(sopt, &ttl, 1, 1); if (error) break; imo->imo_multicast_ttl = ttl; } else { u_int ttl; error = sooptcopyin(sopt, &ttl, sizeof ttl, sizeof ttl); if (error) break; if (ttl > 255) error = EINVAL; else imo->imo_multicast_ttl = ttl; } break; case IP_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. 
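[Editorial sketch] The 0.0.0.0/8 convention implemented by ip_multicast_if() above (per RFC 1724, section 3.3) lets an application name an interface by index rather than by address. A hypothetical caller selecting interface index 3 (descriptor s assumed):

	struct in_addr ifsel;

	ifsel.s_addr = htonl(3);	/* high byte 0 => low 24 bits = ifindex */
	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF,
	    &ifsel, sizeof(ifsel)) == -1)
		err(1, "IP_MULTICAST_IF");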
*/ if (sopt->sopt_valsize == 1) { u_char loop; error = sooptcopyin(sopt, &loop, 1, 1); if (error) break; imo->imo_multicast_loop = !!loop; } else { u_int loop; error = sooptcopyin(sopt, &loop, sizeof loop, sizeof loop); if (error) break; imo->imo_multicast_loop = !!loop; } break; case IP_ADD_MEMBERSHIP: /* * Add a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If no interface address was provided, use the interface of * the route to the given multicast address. */ if (mreq.imr_interface.s_addr == INADDR_ANY) { bzero((caddr_t)&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_len = sizeof(*dst); dst->sin_family = AF_INET; dst->sin_addr = mreq.imr_multiaddr; rtalloc(&ro); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; splx(s); break; } ifp = ro.ro_rt->rt_ifp; rtfree(ro.ro_rt); } else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); } /* * See if we found an interface, and confirm that it * supports multicast. */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; splx(s); break; } /* * See if the membership already exists or if all the * membership slots are full. */ for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i < imo->imo_num_memberships) { error = EADDRINUSE; splx(s); break; } if (i == IP_MAX_MEMBERSHIPS) { error = ETOOMANYREFS; splx(s); break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ if ((imo->imo_membership[i] = in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { error = ENOBUFS; splx(s); break; } ++imo->imo_num_memberships; splx(s); break; case IP_DROP_MEMBERSHIP: /* * Drop a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq.imr_interface.s_addr == INADDR_ANY) ifp = NULL; else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); if (ifp == NULL) { error = EADDRNOTAVAIL; splx(s); break; } } /* * Find the membership in the membership array. */ for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i == imo->imo_num_memberships) { error = EADDRNOTAVAIL; splx(s); break; } /* * Give up the multicast address record to which the * membership points. */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; splx(s); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. */ if (imo->imo_multicast_ifp == NULL && imo->imo_multicast_vif == -1 && imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && imo->imo_num_memberships == 0) { free(*imop, M_IPMOPTS); *imop = NULL; } return (error); } /* * Return the IP multicast options in response to user getsockopt(). 
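[Editorial sketch] The membership bookkeeping above pairs with a small amount of userland code. A hypothetical join (descriptor s assumed), leaving imr_interface as INADDR_ANY so the kernel picks the interface from a route lookup on the group address, exactly as the IP_ADD_MEMBERSHIP case does:

	struct ip_mreq mreq;

	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");	/* group */
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);	/* kernel chooses */
	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
	    &mreq, sizeof(mreq)) == -1)
		err(1, "IP_ADD_MEMBERSHIP");	/* EADDRINUSE if already joined */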
*/ static int ip_getmoptions(sopt, imo) struct sockopt *sopt; register struct ip_moptions *imo; { struct in_addr addr; struct in_ifaddr *ia; int error, optval; u_char coptval; error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; else if (imo->imo_multicast_addr.s_addr) { /* return the value user has set */ addr = imo->imo_multicast_addr; } else { IFP_TO_IA(imo->imo_multicast_ifp, ia); addr.s_addr = (ia == NULL) ? INADDR_ANY : IA_SIN(ia)->sin_addr.s_addr; } error = sooptcopyout(sopt, &addr, sizeof addr); break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; default: error = ENOPROTOOPT; break; } return (error); } /* * Discard the IP multicast options. */ void ip_freemoptions(imo) register struct ip_moptions *imo; { register int i; if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) in_delmulti(imo->imo_membership[i]); free(imo, M_IPMOPTS); } } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(ifp, m, dst, hlen) struct ifnet *ifp; register struct mbuf *m; register struct sockaddr_in *dst; int hlen; { register struct ip *ip; struct mbuf *copym; copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (ip->ip_vhl == IP_VHL_BORING) { ip->ip_sum = in_cksum_hdr(ip); } else { ip->ip_sum = in_cksum(copym, hlen); } /* * NB: * It's not clear whether there are any lingering * reentrancy problems in other areas which might * be exposed by using ip_input directly (in * particular, everything which modifies the packet * in-place). Yet another option is using the * protosw directly to deliver the looped back * packet. For the moment, we'll err on the side * of safety by using if_simloop(). 
*/ #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif #ifdef notdef copym->m_pkthdr.rcvif = ifp; ip_input(copym); #else /* if the checksum hasn't been computed, mark it as valid */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, dst->sin_family, 0); #endif } } Index: head/sys/netinet/ip_var.h =================================================================== --- head/sys/netinet/ip_var.h (revision 105193) +++ head/sys/netinet/ip_var.h (revision 105194) @@ -1,210 +1,211 @@ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IP_VAR_H_ #define _NETINET_IP_VAR_H_ #include #ifdef _KERNEL #include #endif /* * Overlay for ip header used by other protocols (tcp, udp). */ struct ipovly { u_char ih_x1[9]; /* (unused) */ u_char ih_pr; /* protocol */ u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ }; #ifdef _KERNEL /* * Ip reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. * They are timed out after ipq_ttl drops to 0, and may also * be reclaimed if memory becomes tight. 
*/ struct ipq { TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ u_char ipq_ttl; /* time for reass q to live */ u_char ipq_p; /* protocol of this fragment */ u_short ipq_id; /* sequence id for reassembly */ struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; #ifdef IPDIVERT u_int32_t ipq_div_info; /* ipfw divert port & flags */ u_int16_t ipq_div_cookie; /* ipfw divert cookie */ #endif struct label ipq_label; /* MAC label */ }; #endif /* _KERNEL */ /* * Structure stored in mbuf in inpcb.ip_options * and passed to ip_output when ip options are in use. * The actual length of the options (including ipopt_dst) * is in m_len. */ #define MAX_IPOPTLEN 40 struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ char ipopt_list[MAX_IPOPTLEN]; /* options proper */ }; /* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. */ struct ip_moptions { struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ u_short imo_num_memberships; /* no. memberships this socket */ struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; u_long imo_multicast_vif; /* vif num outgoing multicasts */ }; struct ipstat { u_long ips_total; /* total packets received */ u_long ips_badsum; /* checksum bad */ u_long ips_tooshort; /* packet too short */ u_long ips_toosmall; /* not enough data */ u_long ips_badhlen; /* ip header length < data size */ u_long ips_badlen; /* ip length < ip header length */ u_long ips_fragments; /* fragments received */ u_long ips_fragdropped; /* frags dropped (dups, out of space) */ u_long ips_fragtimeout; /* fragments timed out */ u_long ips_forward; /* packets forwarded */ u_long ips_fastforward; /* packets fast forwarded */ u_long ips_cantforward; /* packets rcvd for unreachable dest */ u_long ips_redirectsent; /* packets forwarded on same net */ u_long ips_noproto; /* unknown or unsupported protocol */ u_long ips_delivered; /* datagrams delivered to upper level*/ u_long ips_localout; /* total ip packets generated here */ u_long ips_odropped; /* lost packets due to nobufs, etc. */ u_long ips_reassembled; /* total packets reassembled ok */ u_long ips_fragmented; /* datagrams successfully fragmented */ u_long ips_ofragments; /* output fragments created */ u_long ips_cantfrag; /* don't fragment flag was set, etc. 
*/ u_long ips_badoptions; /* error in option processing */ u_long ips_noroute; /* packets discarded due to no route */ u_long ips_badvers; /* ip version != 4 */ u_long ips_rawout; /* total raw ip packets generated */ u_long ips_toolong; /* ip length > max ip packet size */ u_long ips_notmember; /* multicasts for unregistered grps */ u_long ips_nogif; /* no match gif found */ u_long ips_badaddr; /* invalid address on header */ }; #ifdef _KERNEL /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ #define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ struct ip; struct inpcb; struct route; struct sockopt; extern struct ipstat ipstat; #ifndef RANDOM_IP_ID extern u_short ip_id; /* ip packet ctr, for ids */ #endif extern int ip_defttl; /* default IP ttl */ extern int ipforwarding; /* ip forwarding */ extern struct route ipforward_rt; /* ip forwarding cached route */ extern u_char ip_protox[]; extern struct socket *ip_rsvpd; /* reservation protocol daemon */ extern struct socket *ip_mrouter; /* multicast routing daemon */ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); void ip_freemoptions(struct ip_moptions *); void ip_init(void); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int ip_output(struct mbuf *, - struct mbuf *, struct route *, int, struct ip_moptions *); + struct mbuf *, struct route *, int, struct ip_moptions *, + struct inpcb *); struct in_ifaddr * ip_rtaddr(struct in_addr, struct route *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_slowtimo(void); struct mbuf * ip_srcroute(void); void ip_stripoptions(struct mbuf *, struct mbuf *); #ifdef RANDOM_IP_ID u_int16_t ip_randomid(void); #endif int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); void rip_input(struct mbuf *, int); int rip_output(struct mbuf *, struct socket *, u_long); void ipip_input(struct mbuf *, int); void rsvp_input(struct mbuf *, int); int ip_rsvp_init(struct socket *); int ip_rsvp_done(void); int ip_rsvp_vif_init(struct socket *, struct sockopt *); int ip_rsvp_vif_done(struct socket *, struct sockopt *); void ip_rsvp_force_done(struct socket *); #ifdef IPDIVERT void div_init(void); void div_input(struct mbuf *, int); void divert_packet(struct mbuf *m, int incoming, int port, int rule); extern struct pr_usrreqs div_usrreqs; #endif void in_delayed_cksum(struct mbuf *m); #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 105193) +++ head/sys/netinet/raw_ip.c (revision 105194) @@ -1,732 +1,725 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_random_ip_id.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr; ip_dn_ctl_t *ip_dn_ctl_ptr; /* * Nominal space allocated to a raw ip socket. */ #define RIPSNDQ 8192 #define RIPRCVQ 8192 /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ void rip_init() { INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); LIST_INIT(&ripcb); ripcbinfo.listhead = &ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); } static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. 
*/ void rip_input(m, off) struct mbuf *m; int off; { register struct ip *ip = mtod(m, struct ip *); register struct inpcb *inp; struct inpcb *last = 0; struct mbuf *opts = 0; int proto = ip->ip_p; ripsrc.sin_addr = ip->ip_src; LIST_FOREACH(inp, &ripcb, inp_list) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; if (inp->inp_laddr.s_addr && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); int policyfail = 0; if (n != NULL) { #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(n, last->inp_socket)) { policyfail = 1; ipsecstat.in_polvio++; /* do not inject data to pcb */ } #endif /*IPSEC*/ #ifdef MAC if (policyfail == 0 && mac_check_socket_deliver(last->inp_socket, n) != 0) policyfail = 1; #endif } if (policyfail) m_freem(n); else if (n) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip_savecontrol(last, &opts, ip, n); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); } else sorwakeup(last->inp_socket); opts = 0; } } last = inp; } if (last) { #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(m, last->inp_socket)) { m_freem(m); ipsecstat.in_polvio++; ipstat.ips_delivered--; /* do not inject data to pcb */ return; } #endif /*IPSEC*/ #ifdef MAC if (mac_check_socket_deliver(last->inp_socket, m) != 0) { m_freem(m); ipstat.ips_delivered--; return; } #endif if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip_savecontrol(last, &opts, ip, m); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); } else sorwakeup(last->inp_socket); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } } /* * Generate IP header and pass packet to ip_output. * Tack on options user may have setup with control call. */ int rip_output(m, so, dst) struct mbuf *m; struct socket *so; u_long dst; { register struct ip *ip; register struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; #ifdef MAC mac_create_mbuf_from_socket(so, m); #endif /* * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in.
*/ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_TRYWAIT); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; ip->ip_off = 0; ip->ip_p = inp->inp_ip_p; ip->ip_len = m->m_pkthdr.len; ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } ip = mtod(m, struct ip *); /* don't allow both user specified and setsockopt options, and don't allow packet length sizes that will crash */ if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2)) && inp->inp_options) || (ip->ip_len > m->m_pkthdr.len) || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) { m_freem(m); return EINVAL; } if (ip->ip_id == 0) #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; ipstat.ips_rawout++; } -#ifdef IPSEC - if (ipsec_setsocket(m, so) != 0) { - m_freem(m); - return ENOBUFS; - } -#endif /*IPSEC*/ - return (ip_output(m, inp->inp_options, &inp->inp_route, flags, - inp->inp_moptions)); + inp->inp_moptions, inp)); } /* * Raw IP socket option processing. */ int rip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: if (IPFW_LOADED) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_GET: if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: error = ip_mrouter_get(so, sopt); break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: if (IPFW_LOADED) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (DUMMYNET_LOADED) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = ip_rsvp_done(); break; /* XXX - should be combined */ case IP_RSVP_VIF_ON: error = ip_rsvp_vif_init(so, sopt); break; case IP_RSVP_VIF_OFF: error = ip_rsvp_vif_done(so, sopt); break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: error = ip_mrouter_set(so, sopt); break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, * and calls in_ifadown() to remove all routes corresponding to that address. * It also receives the PRC_IFUP messages from if_up() and reinstalls the * interface routes. 
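[Editorial sketch] rip_ctloutput() above leans on the loadable-module hook pattern declared near the top of this file: ipfw and dummynet publish control functions through pointers when their modules load, and the socket-option path probes the pointer instead of linking against the module. A sketch, with the wrapper name ipfw_sockopt() invented purely for illustration:

	ip_fw_ctl_t *ip_fw_ctl_ptr;		/* NULL until ipfw registers */
	#define	IPFW_LOADED	(ip_fw_ctl_ptr != NULL)

	static int
	ipfw_sockopt(struct sockopt *sopt)
	{
		if (!IPFW_LOADED)		/* module absent: no such option */
			return (ENOPROTOOPT);
		return (ip_fw_ctl_ptr(sopt));
	}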
*/ void rip_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* * in_ifscrub kills the interface route. */ in_ifscrub(ia->ia_ifp, ia); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); break; } } break; case PRC_IFUP: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) return; flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; } } u_long rip_sendspace = RIPSNDQ; u_long rip_recvspace = RIPRCVQ; SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error, s; inp = sotoinpcb(so); if (inp) panic("rip_attach"); if (td && (error = suser(td)) != 0) return error; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; s = splnet(); error = in_pcballoc(so, &ripcbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = ip_defttl; return 0; } static int rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); if (inp == 0) panic("rip_detach"); if (so == ip_mrouter) ip_mrouter_done(); ip_rsvp_force_done(so); if (so == ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); return 0; } static int rip_abort(struct socket *so) { soisdisconnected(so); return rip_detach(so); } static int rip_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return rip_abort(so); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) && (addr->sin_family != AF_IMPLINK)) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) return EADDRNOTAVAIL; inp->inp_laddr = addr->sin_addr; return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if ((addr->sin_family != AF_INET) && (addr->sin_family != AF_IMPLINK)) return EAFNOSUPPORT; inp->inp_faddr = addr->sin_addr; soisconnected(so); return 0; } static int rip_shutdown(struct socket *so) { socantsendmore(so); return 0; } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp = sotoinpcb(so); register u_long dst; if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return EISCONN; } dst = inp->inp_faddr.s_addr; } else { if (nam == NULL) { m_freem(m); return ENOTCONN; } dst = 
((struct sockaddr_in *)nam)->sin_addr.s_addr; } return rip_output(m, so, dst); } static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ s = splnet(); gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; splx(s); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { if (inp->inp_gencnt <= gencnt) { if (cr_canseesocket(req->td->td_ucred, inp->inp_socket)) continue; inp_list[i++] = inp; } } splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); xig.xig_gen = ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = ripcbinfo.ipi_count; splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &ripcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &ripcbinfo)); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); struct pr_usrreqs rip_usrreqs = { rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, pru_listen_notsupp, rip_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, rip_sockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 105193) +++ head/sys/netinet/tcp_input.c (revision 105194) @@ -1,2790 +1,2790 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include /* for ICMP_BANDLIM */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); static const int tcprexmtthresh = 3; tcp_cc tcp_ccgen; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr 
*); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ (tp)->t_inpcb->in6p_route.ro_rt) \ nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - delayed acks are enabled and * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. */ #define DELAY_ACK(tp) \ (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; /* * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == 0) goto present; /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); FREE(te, M_TSEGQ); /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else sbappend(&so->so_rcv, q->tqe_m); FREE(q, M_TSEGQ); q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. 
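 */

/*
 * A self-contained sketch of the leading-overlap arithmetic in
 * tcp_reass() above, assuming 32-bit sequence numbers; the helper
 * name is invented.  Given a queued segment [pseq, pseq + plen) and a
 * new segment starting at seq, it returns how many leading bytes of
 * the new segment are already held; the conversion to int handles
 * sequence-number wraparound exactly as the kernel code does.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;

static int
lead_overlap(tcp_seq pseq, int plen, tcp_seq seq)
{
	int i = (int)(pseq + (tcp_seq)plen - seq);

	return (i > 0 ? i : 0);	/* i >= *tlenp means drop whole segment */
}

/*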
*/ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao *taop; /* pointer to our TAO cache entry */ struct rmxp_tao tao_noncached; /* in case there's no cached entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef MAC int error; #endif /* Grab info from MT_TAG mbufs prepended to the chain. */ for (;m && m->m_type == MT_TAG; m = m->m_next) { - if (m->m_tag_id == PACKET_TAG_IPFORWARD) + if (m->_m_tag_id == PACKET_TAG_IPFORWARD) next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; } #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { /* * Checksum extended TCP header and data. 
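 */

/*
 * A minimal sketch of the ones-complement sum that in_cksum()
 * computes just below over the pseudo-header plus segment; a zero
 * result in th_sum means the segment verified.  This is an
 * assumption-level restatement, not the kernel routine: it uses
 * memcpy() to sidestep alignment and zero-pads the trailing odd
 * byte per RFC 1071.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint16_t
cksum16(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;
	uint16_t w;

	while (len > 1) {
		memcpy(&w, p, 2);		/* alignment-safe load */
		sum += w;
		p += 2;
		len -= 2;
	}
	if (len) {				/* zero-pad the odd byte */
		uint8_t tail[2] = { *p, 0 };
		memcpy(&w, tail, 2);
		sum += w;
	}
	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

/*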
*/ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: /* IPFIREWALL_FORWARD section */ if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */ /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, 1, m->m_pkthdr.rcvif); } } else { if (isipv6) inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 1, m->m_pkthdr.rcvif); else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } #ifdef IPSEC if (isipv6) { if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { ipsec6stat.in_polvio++; goto drop; } } else { if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { ipsecstat.in_polvio++; goto drop; } } #endif /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. 
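 */

/*
 * The blackhole policy applied just below, restated as a predicate
 * (an illustrative sketch; the helper name is invented): level 1
 * suppresses the RST for refused SYNs only, while level 2 -- and,
 * via the switch default, any higher level -- drops every segment
 * arriving at a closed port.
 */
static int
blackhole_drop(int blackhole_level, int is_syn)
{
	return (blackhole_level > 1 || (blackhole_level == 1 && is_syn));
}

/*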
*/ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if (thflags & TH_SYN) log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport)); break; case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; /* Unscale the window into a 32-bit value. */ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; else tiwin = th->th_win; so = inp->inp_socket; #ifdef MAC error = mac_check_socket_deliver(so, m); if (error) goto drop; #endif if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { struct in_conninfo inc; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* skip if this isn't a listen socket */ if ((so->so_options & SO_ACCEPTCONN) == 0) goto after_listen; #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; inc.inc6_route.ro_rt = NULL; /* XXX */ } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; inc.inc_route.ro_rt = NULL; /* XXX */ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache will free mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; /* * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled * until the _second_ ACK is received: * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. 
* rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, * move to ESTAB, set snd_wnd to tiwin. */ tp->snd_wnd = tiwin; /* unscaled */ goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { tcp_dooptions(&to, optp, optlen, 1); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ KASSERT(headlocked, ("headlocked")); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Segment passed TAO tests. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->snd_wnd = tiwin; tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * If there is a FIN, or if there is data and the * connection is local, then delay SYN,ACK(SYN) in * the hope of piggy-backing it on a response * segment. Otherwise must send ACK now in case * the other side is slow starting. 
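 */

/*
 * The DELAY_ACK() policy used just below, restated as a function
 * (illustrative only; the helper name is invented): delay the ACK
 * only when delayed ACKs are enabled, no delack timer is already
 * pending, and the last ACK sent did not advertise a zero-sized
 * window.
 */
static int
should_delay_ack(int delack_enabled, int delack_pending, int rxwin0sent)
{
	return (delack_enabled && !delack_pending && !rxwin0sent);
}

/*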
*/ if (DELAY_ACK(tp) && ((thflags & TH_FIN) || (tlen != 0 && ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr)))))) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); tp->t_flags |= TF_NEEDSYN; } else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; } goto drop; } after_listen: /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options. * XXX this is traditional behavior, may need to be cleaned up. */ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = to.to_requested_s_scale; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & (TOF_CC|TOF_CCNEW)) tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && /* * Using the CC option is compulsory if once started: * the segment is OK if no T/TCP was negotiated or * if the segment has a CC option equal to CCrecv */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && tp->t_dupacks < tcprexmtthresh) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt.
* * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); tp->snd_una = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); INP_UNLOCK(inp); return; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ if (so->so_state & SS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ sbappend(&so->so_rcv, m); } sorwakeup(so); if (DELAY_ACK(tp)) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } INP_UNLOCK(inp); return; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); } if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, * do not send a RST. This may be a retransmission * from the other side after our earlier ACK was lost. 
* Our new SYN, when it arrives, will serve as the * needed ACK. */ if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } if (thflags & TH_RST) { if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really * matches our SYN. If not, just drop it as old * duplicate, but send an RST if we're still playing * by the old rules. If no CC.ECHO option, make sure * we don't get fooled into using T/TCP. */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } } else tp->t_flags &= ~TF_RCVD_CC; tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC mac_set_socket_peer_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ if (taop->tao_ccsent == 0) taop->tao_ccsent = to.to_ccecho; tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ taop->tao_cc = to.to_cc; tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } tp->t_flags |= TF_NEEDSYN; } else tp->t_state = TCPS_SYN_RECEIVED; } else { /* CC.NEW or no option => invalidate cache */ taop->tao_cc = 0; tp->t_state = TCPS_SYN_RECEIVED; } } trimthenstep6: /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. 
* If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * if segment contains a SYN and CC [not CC.NEW] option: * if state == TIME_WAIT and connection duration > MSL, * drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN, and can implicitly * ack the FIN (and data) in retransmission queue. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: if ((thflags & TH_SYN) && (to.to_flags & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); goto findpcb; } else goto drop; } break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall through and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * If we have multiple segments in flight, the initial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337.
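 */

/*
 * A sketch of the RST acceptance test applied just below, assuming
 * 32-bit sequence numbers (helper name invented; the SEQ_* macro
 * definitions are repeated here for self-containment): per RFC 793
 * p. 37 the sequence number must fall inside the receive window,
 * measured against last_ack_sent rather than rcv_nxt to allow for
 * delayed ACKs.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;
#define	SEQ_GEQ(a, b)	((int)((a) - (b)) >= 0)
#define	SEQ_LT(a, b)	((int)((a) - (b)) < 0)

static int
rst_acceptable(tcp_seq seq, tcp_seq last_ack_sent, uint32_t rcv_wnd)
{
	return (SEQ_GEQ(seq, last_ack_sent) &&
	    SEQ_LT(seq, last_ack_sent + rcv_wnd));
}

/*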
*/ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: tp = tcp_close(tp); break; case TCPS_TIME_WAIT: break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; goto dropafterack; } } /* * T/TCP mechanism * If T/TCP was negotiated and the segment doesn't have CC, * or if its CC is wrong then drop the segment. * RST segments do not have to comply with this. */ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) goto dropafterack; /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. 
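 */

/*
 * The trailing-trim computation performed just below, as a
 * self-contained sketch (invented helper name): how many bytes of
 * [seq, seq + tlen) land beyond the right edge of the receive window
 * [rcv_nxt, rcv_nxt + rcv_wnd)?  Modular sequence arithmetic again
 * relies on the conversion to int.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;

static int
trailing_overrun(tcp_seq seq, int tlen, tcp_seq rcv_nxt, uint32_t rcv_wnd)
{
	int todrop = (int)(seq + (tcp_seq)tlen - (rcv_nxt + rcv_wnd));

	return (todrop > 0 ? todrop : 0);
}

/*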
*/ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* * Upon successful completion of 3-way handshake, * update cache.CC if it was undefined, pass any queued * data to the user, and advance state appropriately. */ if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && taop->tao_cc == 0) taop->tao_cc = tp->cc_recv; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. 
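 */

/*
 * The acceptable-ACK range from the comment above, restated as a
 * sketch (invented helper; macros repeated for self-containment):
 * an ACK advances the left edge only when
 * snd_una < th_ack <= snd_max.  At or below snd_una it is a
 * duplicate; above snd_max it acknowledges data never sent.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;
#define	SEQ_GT(a, b)	((int)((a) - (b)) > 0)
#define	SEQ_LEQ(a, b)	((int)((a) - (b)) <= 0)

static int
ack_acceptable(tcp_seq th_ack, tcp_seq snd_una, tcp_seq snd_max)
{
	return (SEQ_GT(th_ack, snd_una) && SEQ_LEQ(th_ack, snd_max));
}

/*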
*/ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (tcp_do_newreno && SEQ_LT(th->th_ack, tp->snd_recover)) { /* False retransmit, should not * cut window */ tp->snd_cwnd += tp->t_maxseg; tp->t_dupacks = 0; (void) tcp_output(tp); goto drop; } if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno) { int is_partialack = SEQ_LT(th->th_ack, tp->snd_recover); if (tp->t_dupacks >= tcprexmtthresh) { if (is_partialack) { tcp_newreno_partial_ack(tp, th); } else { /* * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } /* * Reset dupacks, except on partial acks in * fast recovery. */ if (!(tp->t_dupacks >= tcprexmtthresh && is_partialack)) tp->t_dupacks = 0; } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; } if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling?
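 */

/*
 * A sketch of the window cut made on the third duplicate ACK in the
 * fast-retransmit block above (invented helper name): ssthresh drops
 * to half the effective window, floored at two segments, and cwnd
 * collapses to one segment so only the missing packet is resent; the
 * caller then re-inflates cwnd to ssthresh + dupacks * maxseg.
 */
#include <stdint.h>

static void
enter_fast_retransmit(uint32_t snd_wnd, uint32_t snd_cwnd,
    uint32_t t_maxseg, uint32_t *ssthresh, uint32_t *cwnd)
{
	uint32_t win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) /
	    2 / t_maxseg;

	if (win < 2)
		win = 2;
	*ssthresh = win * t_maxseg;
	*cwnd = t_maxseg;		/* retransmit just one segment */
}

/*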
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } } process_ACK: acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; /* * If t_dupacks != 0 here, it indicates that we are still * in NewReno fast recovery mode, so we leave the congestion * window alone. */ if (!tcp_do_newreno || tp->t_dupacks == 0) tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } sowwakeup(so); tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever.
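 */

/*
 * The congestion-window growth applied in the block above, as a
 * sketch (invented helper name): one maxseg per ACK during slow
 * start, roughly maxseg per round trip (maxseg^2 / cwnd per ACK) in
 * congestion avoidance, clamped at the largest window the peer's
 * scale factor permits (TCP_MAXWIN << snd_scale above).
 */
#include <stdint.h>

static uint32_t
grow_cwnd(uint32_t cw, uint32_t ssthresh, uint32_t maxseg,
    uint32_t cwnd_clamp)
{
	uint32_t incr = maxseg;

	if (cw > ssthresh)
		incr = incr * incr / cw;	/* congestion avoidance */
	return (cw + incr < cwnd_clamp ? cw + incr : cwnd_clamp);
}

/*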
*/ if (so->so_state & SS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= SS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. 
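 */

/*
 * The out-of-band mark computed above, restated (invented helper
 * name): the urgent byte sits (rcv_up - rcv_nxt) bytes past what is
 * already queued in the receive buffer; a mark of zero means the
 * next byte read is at the mark, i.e. SS_RCVATMARK.
 */
#include <stdint.h>

typedef uint32_t tcp_seq;

static uint32_t
oob_mark(uint32_t sb_cc, tcp_seq rcv_up, tcp_seq rcv_nxt)
{
	return (sb_cc + (rcv_up - rcv_nxt) - 1);
}

/*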
*/ if (th->th_urp <= (u_long)tlen #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, th, m, drop_hdrlen); /* hdr drop is delayed */ } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); if (so->so_state & SS_CANTRCVMORE) m_freem(m); else sbappend(&so->so_rcv, m); sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); /* For transaction client, force ACK now. 
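 */

/*
 * The TIME_WAIT timer selection from the FIN_WAIT_2 case above, as a
 * sketch (invented helper; callout plumbing elided): a T/TCP
 * connection (cc_recv != 0) younger than one MSL gets the truncated
 * wait of RFC 1644, anything else the classic 2 * MSL.
 */
static int
time_wait_ticks(int cc_recv, int age_ticks, int msl_ticks,
    int t_rxtcur, int twtrunc)
{
	if (cc_recv != 0 && age_ticks < msl_ticks)
		return (t_rxtcur * twtrunc);	/* shortened TIME_WAIT */
	return (2 * msl_ticks);
}

/*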
*/ tp->t_flags |= TF_ACKNOW; } else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); break; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); INP_UNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); return; dropwithreset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); m_freem(m); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Parse TCP options and place in tcpopt. 
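*/
/*
 * Illustrative sketch (editor's addition): the SYN-RECEIVED test above
 * accepts an ACK only when it falls in the window
 * snd_una <= ack <= snd_max; anything else draws a RST, which is what
 * breaks the "LAND" loop.  Standalone version:
 */
#include <stdio.h>
#include <stdint.h>

#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

static int
ack_acceptable(uint32_t snd_una, uint32_t snd_max, uint32_t ack)
{
	return (!(SEQ_GT(snd_una, ack) || SEQ_GT(ack, snd_max)));
}

int
main(void)
{
	printf("%d\n", ack_acceptable(100, 200, 150));	/* 1: in window */
	printf("%d\n", ack_acceptable(100, 200, 50));	/* 0: send RST */
	printf("%d\n", ack_acceptable(100, 200, 300));	/* 0: send RST */
	return (0);
}
/*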
*/ static void tcp_dooptions(to, cp, cnt, is_syn) struct tcpopt *to; u_char *cp; int cnt; int is_syn; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!is_syn) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!is_syn) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; to->to_flags |= TOF_CC; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCNEW; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCECHO; bcopy((char *)cp + 2, (char *)&to->to_ccecho, sizeof(to->to_ccecho)); to->to_ccecho = ntohl(to->to_ccecho); break; default: continue; } } } /* * Pull the out-of-band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be dropped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 32). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta.
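*/
/*
 * Illustrative sketch (editor's addition, simplified from the option
 * loop in tcp_dooptions() above): walking a raw TCP options buffer.
 * EOL ends the list, NOP is a one-byte pad, and every other option
 * carries a length byte that must stay within the remaining space --
 * the same bounds checks made above.
 */
#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1
#define TCPOPT_MAXSEG	2

int
main(void)
{
	/* NOP, NOP, MSS option (kind 2, len 4, value 1460) */
	uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };
	int cnt = (int)sizeof(opts), optlen;
	uint8_t *cp = opts;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		int opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;	/* malformed: stop parsing */
		}
		if (opt == TCPOPT_MAXSEG && optlen == 4)
			printf("mss = %d\n", (cp[2] << 8) | cp[3]);
	}
	return (0);
}
/*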
*/ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * NOTE that this routine is only called when we process an incoming * segment, for outgoing segments only tcp_mssopt is called. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { register struct rtentry *rt; struct ifnet *ifp; register int rtt, mss; u_long bufsize; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct rmxp_tao *taop; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else rt = tcp_rtlookup(&inp->inp_inc); if (rt == NULL) { tp->t_maxopd = tp->t_maxseg = isipv6 ? 
tcp_v6mssdflt : tcp_mssdflt; return; } ifp = rt->rt_ifp; so = inp->inp_socket; taop = rmx_taop(rt->rt_rmx); /* * Offer == -1 means that we didn't receive SYN yet, * use the cached value in that case. */ if (offer == -1) offer = taop->tao_mssopt; /* * Offer == 0 means that there was no MSS on the SYN segment, * in this case we use tcp_mssdflt. */ if (offer == 0) offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; else /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if all * the option space is used (40 bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); taop->tao_mssopt = offer; /* * While we're here, check if there's an initial rtt * or rttvar. Convert from the route-table units * to scaled multiples of the slow timeout timer. */ if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { /* * XXX the lock bit for RTT indicates that the value * is also a minimum value; this is subject to time. */ if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } /* * if there's an mtu associated with the route, use it; * otherwise, use the link mtu. */ if (rt->rt_rmx.rmx_mtu) mss = rt->rt_rmx.rmx_mtu - min_protoh; else { if (isipv6) { mss = nd_ifinfo[rt->rt_ifp->if_index].linkmtu - min_protoh; if (!in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else { mss = ifp->if_mtu - min_protoh; if (!in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * In case of T/TCP, origoffer==-1 indicates that no segments * were received yet. In this case we just guess, otherwise * we do the same as before T/TCP. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif /* * If there's a pipesize, change the socket buffer * to that size. Make the socket buffers an integral * number of mss units; if the mss is larger than * the socket buffer, decrease the mss.
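*/
/*
 * Illustrative sketch (editor's addition, with assumed example values):
 * the MSS arithmetic above in miniature.  For IPv4 over Ethernet,
 * min_protoh is the 40-byte IP+TCP header, so a 1500-byte MTU yields
 * 1460; the result is capped by the peer's offer and rounded down to a
 * multiple of MCLBYTES (2048 here) only when it exceeds one cluster.
 */
#include <stdio.h>

#define MCLBYTES 2048

static int
pick_mss(int mtu, int min_protoh, int offer)
{
	int mss = mtu - min_protoh;

	if (offer > 0 && offer < mss)
		mss = offer;
	if (mss > MCLBYTES)		/* round down to whole clusters */
		mss &= ~(MCLBYTES - 1);
	return (mss);
}

int
main(void)
{
	printf("%d\n", pick_mss(1500, 40, 0));		/* 1460 */
	printf("%d\n", pick_mss(9000, 40, 0));		/* 8192 */
	printf("%d\n", pick_mss(1500, 40, 536));	/* 536  */
	return (0);
}
/*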
*/ #ifdef RTV_SPIPE if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) #endif bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve(&so->so_snd, bufsize, so, NULL); } tp->t_maxseg = mss; #ifdef RTV_RPIPE if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) #endif bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve(&so->so_rcv, bufsize, so, NULL); } /* * Set the slow-start flight size depending on whether this * is a local network or not. */ if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; if (rt->rt_rmx.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(tp) struct tcpcb *tp; { struct rtentry *rt; #ifdef INET6 int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); else rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); if (rt == NULL) return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt); return (rt->rt_ifp->if_mtu - min_protoh); } /* * When a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to th_ack, this forces the retransmission timer * to be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on the fact that tp->snd_una * has not been updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); } Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 105193) +++ head/sys/netinet/tcp_output.c (revision 105194) @@ -1,1010 +1,1001 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #ifdef notyet extern struct mbuf *m_copypack(); #endif int path_mtu_discovery = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, &path_mtu_discovery, 1, "Enable Path MTU Discovery"); int ss_fltsz = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, &ss_fltsz, 1, "Slow start flight size"); int ss_fltsz_local = 4; SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, &ss_fltsz_local, 1, "Slow start flight size for local networks"); int tcp_do_newreno = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, 0, "Enable NewReno Algorithms"); /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, win; int off, flags, error; struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; #if 0 int maxburst = TCP_MAXBURST; #endif struct rmxp_tao *taop; struct rmxp_tao tao_noncached; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif #ifndef INET6 mtx_assert(&tp->t_inpcb->inp_mtx, MA_OWNED); #endif /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. * * Set the slow-start flight size depending on whether * this is a local network or not. 
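*/
/*
 * Illustrative sketch (editor's addition): the idle test above in
 * isolation.  If the connection has been quiet for at least one
 * retransmit timeout, the ACK clock is gone, so the congestion window
 * is pulled back to a small number of segments (ss_fltsz) before
 * sending again.  The helper name is invented for illustration.
 */
#include <stdio.h>

static long
restart_cwnd(long cwnd, long maxseg, int ss_fltsz,
    int idle, int ticks, int t_rcvtime, int t_rxtcur)
{
	if (idle && (ticks - t_rcvtime) >= t_rxtcur)
		cwnd = maxseg * ss_fltsz;	/* slow start over */
	return (cwnd);
}

int
main(void)
{
	/* a busy connection keeps its window ... */
	printf("%ld\n", restart_cwnd(32768, 1460, 1, 0, 1000, 990, 300));
	/* ... a long-idle one restarts at one segment */
	printf("%ld\n", restart_cwnd(32768, 1460, 1, 1, 1000, 500, 300));
	return (0);
}
/*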
*/ int ss = ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) ss = ss_fltsz_local; } else #endif /* INET6 */ if (in_localaddr(tp->t_inpcb->inp_faddr)) ss = ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); win = min(win, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_force) { if (win == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; win = 1; } else { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when tcp opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. */ len = (long)ulmin(so->so_snd.sb_cc, win) - off; if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT && taop->tao_ccsent == 0) return 0; } /* * Be careful not to send data and/or FIN on SYN segments * in cases when no CC option will be sent. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (win == 0) { callout_stop(tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!callout_active(tp->tt_persist)) tcp_setpersist(tp); } } /* * len will be >= 0 after this point. 
Truncate to the maximum * segment length and ensure that FIN is removed if the length * no longer contains the last data byte. */ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; win = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_force) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (win > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); if (adv >= (long) (2 * tp->t_maxseg)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * callout_active(tp->tt_persist) * is true when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * callout_active(tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. 
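*/
/*
 * Illustrative sketch (editor's addition, simplified): the sender-side
 * silly window avoidance rules listed above collected into a single
 * predicate.  "len" is what could be sent now; transmission happens
 * only when at least one of the listed conditions holds.  All names
 * here are invented for illustration.
 */
#include <stdio.h>

static int
should_send(long len, long maxseg, long off, long sb_cc,
    int idle_or_nodelay, int forced, long max_sndwnd, int retransmitting)
{
	if (len == maxseg)
		return (1);	/* full-sized segment */
	if (idle_or_nodelay && len + off >= sb_cc)
		return (1);	/* last buffer of a write, no Nagle delay */
	if (forced)
		return (1);	/* persist probe */
	if (max_sndwnd > 0 && len >= max_sndwnd / 2)
		return (1);	/* half the peer's best window */
	if (retransmitting)
		return (1);
	return (0);
}

int
main(void)
{
	printf("%d\n", should_send(1460, 1460, 0, 4096, 0, 0, 65535, 0));
	printf("%d\n", should_send(100, 1460, 0, 4096, 0, 0, 65535, 0));
	return (0);
}
/*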
*/ return (0); send: /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if ((tp->t_flags & TF_NOOPT) == 0) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (flags & TH_RST) == 0 && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. */ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (flags & TH_RST) == 0) { switch (flags & (TH_SYN|TH_ACK)) { /* * This is a normal ACK, send CC if we received CC before * from our peer. */ case TH_ACK: if (!(tp->t_flags & TF_RCVD_CC)) break; /*FALLTHROUGH*/ /* * We can only get here in T/TCP's SYN_SENT* state, when * we're a sending a non-SYN segment without waiting for * the ACK of our SYN. A check above assures that we only * do this if our peer understands T/TCP. */ case 0: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is our initial SYN, check whether we have to use * CC or CC.new. */ case TH_SYN: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? TCPOPT_CCNEW : TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is a SYN,ACK; send CC and CC.echo if we received * CC from our peer. */ case (TH_SYN|TH_ACK): if (tp->t_flags & TF_RCVD_CC) { opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CCECHO; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_recv); optlen += 4; } break; } } hdrlen += optlen; #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_hdrsiz_tcp(tp); #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. 
* Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxopd) { /* * If there is still more to send, don't close the connection. */ flags &= ~TH_FIN; len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { if (tp->t_force && len == 1) tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, (int)len, max_linkhdr + hdrlen)) == 0) { error = ENOBUFS; goto out; } /* * m_copypack left space for our hdr; use it. */ m->m_len += hdrlen; m->m_data -= hdrlen; #else MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; goto out; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); error = ENOBUFS; goto out; } } #endif m->m_data += max_linkhdr; m->m_len = hdrlen; if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(so->so_snd.sb_mb, off, (int) len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; goto out; } } #endif /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; } else { if (tp->t_flags & TF_ACKNOW) tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcpstat.tcps_sndurg++; else tcpstat.tcps_sndwinup++; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_create_mbuf_from_socket(so, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcp_fillheaders(tp, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); /* this picks up the pseudo header (w/o the length) */ tcp_fillheaders(tp, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) 
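*/
/*
 * Illustrative sketch (editor's addition, with assumed numbers): the
 * option-space adjustment made earlier in this routine.  t_maxopd
 * limits data plus TCP/IP options, so when options grow, the data must
 * shrink and another pass ("sendalot") moves the remainder.
 */
#include <stdio.h>

int
main(void)
{
	long len = 1460, t_maxopd = 1460;
	int optlen = 12, ipoptlen = 0, sendalot = 0;

	if (len + optlen + ipoptlen > t_maxopd) {
		len = t_maxopd - optlen - ipoptlen;
		sendalot = 1;	/* leftover data forces another segment */
	}
	printf("len=%ld sendalot=%d\n", len, sendalot);	/* 1448, 1 */
	return (0);
}
/*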
*/ if (len || (flags & (TH_SYN|TH_FIN)) || callout_active(tp->tt_persist)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; th->th_win = htons((u_short) (win>>tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data then can be buffered prior to transmitting on * the connection. */ if (win == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen + len); else #endif /* INET6 */ { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (len + optlen) th->th_sum = in_addword(th->th_sum, htons((u_short)(optlen + len))); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ if (!callout_active(tp->tt_rexmt) && tp->snd_nxt != tp->snd_una) { if (callout_active(tp->tt_persist)) { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. 
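*/
/*
 * Illustrative sketch (editor's addition): the advertised-window
 * calculation above.  Small windows are rounded down to zero
 * (receiver-side silly window avoidance), the window is never pulled
 * back below what was already advertised, and it is clamped to what
 * the scale factor can express.  Parameter names are invented.
 */
#include <stdio.h>

#define TCP_MAXWIN 65535L

static long
calc_window(long win, long sb_hiwat, long maxseg,
    long rcv_adv_minus_nxt, int rcv_scale)
{
	if (win < sb_hiwat / 4 && win < maxseg)
		win = 0;			/* too small to be useful */
	if (win < rcv_adv_minus_nxt)
		win = rcv_adv_minus_nxt;	/* never shrink the window */
	if (win > TCP_MAXWIN << rcv_scale)
		win = TCP_MAXWIN << rcv_scale;
	return (win);
}

int
main(void)
{
	printf("%ld\n", calc_window(500, 65536, 1460, 0, 0));	  /* 0 */
	printf("%ld\n", calc_window(500, 65536, 1460, 2000, 0)); /* 2000 */
	printf("%ld\n", calc_window(300000, 65536, 1460, 0, 1)); /* 131070 */
	return (0);
}
/*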
*/ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #endif /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, tp->t_inpcb->in6p_route.ro_rt ? tp->t_inpcb->in6p_route.ro_rt->rt_ifp : NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ -#ifdef IPSEC - if (ipsec_setsocket(m, so) != 0) { - m_freem(m); - error = ENOBUFS; - goto out; - } -#endif /*IPSEC*/ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &tp->t_inpcb->in6p_route, - (so->so_options & SO_DONTROUTE), NULL, NULL); + (so->so_options & SO_DONTROUTE), NULL, NULL, + tp->t_inpcb); } else #endif /* INET6 */ { struct rtentry *rt; ip->ip_len = m->m_pkthdr.len; #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, tp->t_inpcb->in6p_route.ro_rt ? tp->t_inpcb->in6p_route.ro_rt->rt_ifp : NULL); else #endif /* INET6 */ ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ /* * See if we should do MTU discovery. We do it only if the following * are true: * 1) we have a valid route to the destination * 2) the MTU is not locked (if it is, then discovery has been * disabled) */ if (path_mtu_discovery && (rt = tp->t_inpcb->inp_route.ro_rt) && rt->rt_flags & RTF_UP && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { ip->ip_off |= IP_DF; } -#ifdef IPSEC - ipsec_setsocket(m, so); -#endif /*IPSEC*/ error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, - (so->so_options & SO_DONTROUTE), 0); + (so->so_options & SO_DONTROUTE), 0, tp->t_inpcb); } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. */ if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { /* * No need to check for TH_FIN here because * the TF_SENTFIN flag handles that case. */ if ((flags & TH_SYN) == 0) tp->snd_nxt -= len; } out: if (error == ENOBUFS) { if (!callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); tcp_quench(tp->t_inpcb, 0); return (0); } if (error == EMSGSIZE) { /* * ip_output() will have already fixed the route * for us. tcp_mtudisc() will, as its last action, * initiate retransmission, so it is important to * not do so here. */ tcp_mtudisc(tp->t_inpcb, 0); return 0; } if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } return (error); } tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. 
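*/
/*
 * Illustrative sketch (editor's addition): the error dispositions
 * above as a table.  ENOBUFS is treated as local congestion (quench
 * and rely on the timer), EMSGSIZE hands the new path MTU to
 * tcp_mtudisc(), and transient unreachables merely record a soft
 * error once the connection has seen a SYN.  The helper is invented
 * for illustration.
 */
#include <stdio.h>
#include <errno.h>

static const char *
output_error_action(int error, int have_rcvd_syn)
{
	switch (error) {
	case 0:		return ("sent");
	case ENOBUFS:	return ("quench + retransmit timer");
	case EMSGSIZE:	return ("tcp_mtudisc retransmits");
	case EHOSTUNREACH:
	case ENETDOWN:
		if (have_rcvd_syn)
			return ("record soft error");
		/* FALLTHROUGH */
	default:	return ("report error to user");
	}
}

int
main(void)
{
	printf("%s\n", output_error_action(ENOBUFS, 1));
	printf("%s\n", output_error_action(EHOSTUNREACH, 1));
	printf("%s\n", output_error_action(EHOSTUNREACH, 0));
	return (0);
}
/*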
*/ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~TF_ACKNOW; if (tcp_delack_enabled) callout_stop(tp->tt_delack); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && (!tcp_do_newreno || --maxburst)) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(tp) register struct tcpcb *tp; { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; if (callout_active(tp->tt_rexmt)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } Index: head/sys/netinet/tcp_reass.c =================================================================== --- head/sys/netinet/tcp_reass.c (revision 105193) +++ head/sys/netinet/tcp_reass.c (revision 105194) @@ -1,2790 +1,2790 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include /* for ICMP_BANDLIM */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); static const int tcprexmtthresh = 3; tcp_cc tcp_ccgen; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ (tp)->t_inpcb->in6p_route.ro_rt) \ nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - delayed acks are enabled and * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. */ #define DELAY_ACK(tp) \ (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te; struct socket *so = tp->t_inpcb->inp_socket; int flags; /* * Call with th==0 after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == 0) goto present; /* Allocate a new queue entry. If we can't, just drop the pkt. 
XXX */ MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); return (0); } /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); FREE(te, M_TSEGQ); /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else sbappend(&so->so_rcv, q->tqe_m); FREE(q, M_TSEGQ); q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? 
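*/
/*
 * Illustrative sketch (editor's addition, much simplified): the heart
 * of tcp_reass() above -- find the first queued segment starting after
 * the new one, then trim any bytes the predecessor already covers.
 * Arrays stand in for the kernel's tseg_qent list, and sequence-number
 * wraparound is ignored for brevity.
 */
#include <stdio.h>
#include <stdint.h>

struct seg { uint32_t seq; int len; };

int
main(void)
{
	struct seg q[] = { { 1000, 100 }, { 1200, 100 } };
	struct seg in = { 1050, 100 };	/* overlaps q[0] by 50 bytes */
	int n = 2, i, overlap;
	struct seg *p = NULL;

	for (i = 0; i < n; i++) {	/* find successor, remember pred */
		if (q[i].seq > in.seq)
			break;
		p = &q[i];
	}
	if (p != NULL) {
		overlap = (int)(p->seq + p->len - in.seq);
		if (overlap >= in.len)
			in.len = 0;		/* complete duplicate */
		else if (overlap > 0) {
			in.seq += overlap;	/* drop covered prefix */
			in.len -= overlap;
		}
	}
	printf("keep seq=%lu len=%d\n", (unsigned long)in.seq, in.len);
	return (0);			/* prints: keep seq=1100 len=50 */
}
/*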
*/ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ struct rmxp_tao *taop; /* pointer to our TAO cache entry */ struct rmxp_tao tao_noncached; /* in case there's no cached entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef MAC int error; #endif /* Grab info from MT_TAG mbufs prepended to the chain. */ for (;m && m->m_type == MT_TAG; m = m->m_next) { - if (m->m_tag_id == PACKET_TAG_IPFORWARD) + if (m->_m_tag_id == PACKET_TAG_IPFORWARD) next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; } #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. 
XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: /* IPFIREWALL_FORWARD section */ if (next_hop != NULL && isipv6 == 0) { /* IPv6 support is not yet */ /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, 1, m->m_pkthdr.rcvif); } } else { if (isipv6) inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 1, m->m_pkthdr.rcvif); else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } #ifdef IPSEC if (isipv6) { if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { ipsec6stat.in_polvio++; goto drop; } } else { if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { ipsecstat.in_polvio++; goto drop; } } #endif /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. 
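*/
/*
 * Illustrative sketch (editor's addition): the data-offset sanity
 * check made earlier in tcp_input().  th_off counts 32-bit words, so
 * the header length is th_off << 2 and must cover at least the fixed
 * 20-byte header without running past the segment.
 */
#include <stdio.h>

static int
tcp_off_ok(unsigned th_off, int tlen)
{
	int off = th_off << 2;

	return (off >= 20 && off <= tlen);	/* sizeof(struct tcphdr) */
}

int
main(void)
{
	printf("%d\n", tcp_off_ok(5, 40));	/* 1: no options */
	printf("%d\n", tcp_off_ok(8, 40));	/* 1: 12 bytes of options */
	printf("%d\n", tcp_off_ok(4, 40));	/* 0: shorter than header */
	printf("%d\n", tcp_off_ok(15, 40));	/* 0: runs past segment */
	return (0);
}
/*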
*/ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if (thflags & TH_SYN) log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport)); break; case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; /* Unscale the window into a 32-bit value. */ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; else tiwin = th->th_win; so = inp->inp_socket; #ifdef MAC error = mac_check_socket_deliver(so, m); if (error) goto drop; #endif if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { struct in_conninfo inc; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* skip if this isn't a listen socket */ if ((so->so_options & SO_ACCEPTCONN) == 0) goto after_listen; #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; inc.inc6_route.ro_rt = NULL; /* XXX */ } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; inc.inc_route.ro_rt = NULL; /* XXX */ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache will free mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; /* * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled * until the _second_ ACK is received: * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. 
* rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, * move to ESTAB, set snd_wnd to tiwin. */ tp->snd_wnd = tiwin; /* unscaled */ goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { tcp_dooptions(&to, optp, optlen, 1); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ KASSERT(headlocked, ("headlocked")); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Segment passed TAO tests. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->snd_wnd = tiwin; tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * If there is a FIN, or if there is data and the * connection is local, then delay SYN,ACK(SYN) in * the hope of piggy-backing it on a response * segment. Otherwise must send ACK now in case * the other side is slow starting. 
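 */

/*
 * The decision made just below, condensed into a predicate: hold the
 * SYN,ACK back (delayed ACK) only when a FIN came with the data or the
 * sender looks local, hoping to piggy-back it on a response segment.
 * The parameter names are illustrative, not kernel interfaces.
 */
static int
want_delayed_synack(int delack_allowed, int has_fin, int datalen,
    int peer_is_local)
{
	if (!delack_allowed)
		return (0);		/* delayed ACKs are off */
	return (has_fin || (datalen != 0 && peer_is_local));
}

/*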
*/ if (DELAY_ACK(tp) && ((thflags & TH_FIN) || (tlen != 0 && ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr)))))) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); tp->t_flags |= TF_NEEDSYN; } else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; } goto drop; } after_listen: /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options. * XXX this is tradtitional behavior, may need to be cleaned up. */ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; tp->requested_s_scale = to.to_requested_s_scale; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & (TOF_CC|TOF_CCNEW)) tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && /* * Using the CC option is compulsory if once started: * the segment is OK if no T/TCP was negotiated or * if the segment has a CC option equal to CCrecv */ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && tp->t_dupacks < tcprexmtthresh) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. 
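 */

/*
 * The header-prediction gate above, restated as a pure function over a
 * simplified stand-in for the tcpcb.  The timestamp and T/TCP clauses
 * are omitted for brevity; only the core "pure ACK or pure in-order
 * data, window unchanged, not retransmitting" test is kept.
 */
#include <stdint.h>

struct hp_state {
	uint32_t rcv_nxt;		/* next sequence expected */
	uint32_t snd_wnd;		/* current send window */
	uint32_t snd_nxt, snd_max;	/* retransmission check */
};

static int
header_prediction_ok(const struct hp_state *tp, uint32_t seq,
    uint32_t tiwin, int only_ack_set)
{
	return (only_ack_set &&			  /* no SYN/FIN/RST/URG */
	    seq == tp->rcv_nxt &&		  /* in sequence */
	    tiwin != 0 && tiwin == tp->snd_wnd && /* window unchanged */
	    tp->snd_nxt == tp->snd_max);	  /* nothing being resent */
}

/*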
* * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); tp->snd_una = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); INP_UNLOCK(inp); return; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ if (so->so_state & SS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ sbappend(&so->so_rcv, m); } sorwakeup(so); if (DELAY_ACK(tp)) { callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } INP_UNLOCK(inp); return; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); } if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { /* * If we have a cached CCsent for the remote host, * hence we haven't just crashed and restarted, * do not send a RST. This may be a retransmission * from the other side after our earlier ACK was lost. 
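 */

/*
 * A sketch of the receive-window computation shown above: take the free
 * space in the receive buffer, but never retreat below what was already
 * advertised (rcv_adv - rcv_nxt).  imax_sk() is a stand-in for the
 * kernel's imax().
 */
static int
imax_sk(int a, int b)
{
	return (a > b ? a : b);
}

static unsigned int
calc_rcv_wnd(int buf_space, unsigned int rcv_adv, unsigned int rcv_nxt)
{
	int win = buf_space;

	if (win < 0)
		win = 0;		/* buffer may be over-committed */
	return (imax_sk(win, (int)(rcv_adv - rcv_nxt)));
}

/*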
* Our new SYN, when it arrives, will serve as the * needed ACK. */ if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } if (thflags & TH_RST) { if (thflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ tp->cc_recv = to.to_cc; /* foreign CC */ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO * option, check it to make sure this segment really * matches our SYN. If not, just drop it as old * duplicate, but send an RST if we're still playing * by the old rules. If no CC.ECHO option, make sure * we don't get fooled into using T/TCP. */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { if (taop->tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } } } else tp->t_flags &= ~TF_RCVD_CC; tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC mac_set_socket_peer_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ if (taop->tao_ccsent == 0) taop->tao_ccsent = to.to_ccecho; tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ taop->tao_cc = to.to_cc; tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } tp->t_flags |= TF_NEEDSYN; } else tp->t_state = TCPS_SYN_RECEIVED; } else { /* CC.NEW or no option => invalidate cache */ taop->tao_cc = 0; tp->t_state = TCPS_SYN_RECEIVED; } } trimthenstep6: /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. 
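 */

/*
 * A simplified model of the trimming at trimthenstep6 above: bytes
 * beyond the receive window are chopped off the tail, and a FIN that
 * rode on the trimmed bytes is dropped with them.  The mbuf surgery
 * (m_adj) is reduced to length bookkeeping.
 */
static int
trim_to_window(int *tlen, unsigned int rcv_wnd, int *has_fin)
{
	int todrop = *tlen - (int)rcv_wnd;

	if (todrop <= 0)
		return (0);		/* everything fits */
	*tlen = (int)rcv_wnd;		/* keep only in-window bytes */
	*has_fin = 0;			/* FIN was past the window edge */
	return (todrop);		/* bytes the caller would m_adj() off */
}

/*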
* If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * if segment contains a SYN and CC [not CC.NEW] option: * if state == TIME_WAIT and connection duration > MSL, * drop packet and send RST; * * if SEG.CC > CCrecv then is new SYN, and can implicitly * ack the FIN (and data) in retransmission queue. * Complete close and delete TCPCB. Then reprocess * segment, hoping to find new TCPCB in LISTEN state; * * else must be old SYN; drop it. * else do normal processing. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: if ((thflags & TH_SYN) && (to.to_flags & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && (ticks - tp->t_starttime) > tcp_msl) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); goto findpcb; } else goto drop; } break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. 
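 */

/*
 * The SEQ_GEQ/SEQ_LT comparisons relied on below operate in a 32-bit
 * circular sequence space.  A self-contained rendering of the idiom,
 * applied to the RST in-window test this comment describes; like the
 * real macros it assumes two's-complement wraparound.
 */
#include <stdint.h>

#define SEQ_LT_SK(a, b)		((int32_t)((a) - (b)) < 0)
#define SEQ_GEQ_SK(a, b)	((int32_t)((a) - (b)) >= 0)

/* Is seq within [last_ack_sent, last_ack_sent + rcv_wnd)? */
static int
rst_seq_acceptable(uint32_t seq, uint32_t last_ack_sent, uint32_t rcv_wnd)
{
	return (SEQ_GEQ_SK(seq, last_ack_sent) &&
	    SEQ_LT_SK(seq, last_ack_sent + rcv_wnd));
}

/*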
*/ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: tp = tcp_close(tp); break; case TCPS_TIME_WAIT: break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; goto dropafterack; } } /* * T/TCP mechanism * If T/TCP was negotiated and the segment doesn't have CC, * or if its CC is wrong then drop the segment. * RST segments do not have to comply with this. */ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) goto dropafterack; /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. 
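 */

/*
 * The PAWS rule above in isolation: a segment whose timestamp is older
 * than ts_recent is dropped, unless ts_recent itself has gone stale
 * (idle past the ~24-day limit), in which case ts_recent is invalidated
 * and the segment kept.  The tick rate here is an assumption for
 * illustration.
 */
#include <stdint.h>

#define PAWS_IDLE_SK	(24 * 24 * 60 * 60 * 1000)	/* ~24 days at 1000 hz */

/* Returns 1 to drop the segment, 0 to keep processing it. */
static int
paws_drop(uint32_t tsval, uint32_t *ts_recent, int now, int ts_recent_age)
{
	if (*ts_recent == 0 || (int32_t)(tsval - *ts_recent) >= 0)
		return (0);		/* timestamp is current */
	if (now - ts_recent_age > PAWS_IDLE_SK) {
		*ts_recent = 0;		/* stale: invalidate, keep segment */
		return (0);
	}
	return (1);			/* genuine PAWS failure */
}

/*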
*/ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } /* * Upon successful completion of 3-way handshake, * update cache.CC if it was undefined, pass any queued * data to the user, and advance state appropriately. */ if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && taop->tao_cc == 0) taop->tao_cc = tp->cc_recv; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. 
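 */

/*
 * The acceptance test stated above, snd_una < th_ack <= snd_max, written
 * out with the circular-space comparisons: an ACK at or below snd_una is
 * a duplicate, one beyond snd_max acknowledges data never sent.
 */
#include <stdint.h>

enum ack_class { ACK_DUP, ACK_OK, ACK_TOOMUCH };

static enum ack_class
classify_ack(uint32_t th_ack, uint32_t snd_una, uint32_t snd_max)
{
	if ((int32_t)(th_ack - snd_una) <= 0)
		return (ACK_DUP);	/* nothing new acknowledged */
	if ((int32_t)(th_ack - snd_max) > 0)
		return (ACK_TOOMUCH);	/* beyond anything we sent */
	return (ACK_OK);
}

/*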
*/ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (tcp_do_newreno && SEQ_LT(th->th_ack, tp->snd_recover)) { /* False retransmit, should not * cut window */ tp->snd_cwnd += tp->t_maxseg; tp->t_dupacks = 0; (void) tcp_output(tp); goto drop; } if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno) { int is_partialack = SEQ_LT(th->th_ack, tp->snd_recover); if (tp->t_dupacks >= tcprexmtthresh) { if (is_partialack) { tcp_newreno_partial_ack(tp, th); } else { /* * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } /* * Reset dupacks, except on partial acks in * fast recovery. */ if (!(tp->t_dupacks >= tcprexmtthresh && is_partialack)) tp->t_dupacks = 0; } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; } if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? 
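 */

/*
 * The fast-retransmit arithmetic from the duplicate-ACK branch above,
 * pulled out on its own: ssthresh drops to half the flight size (never
 * below two segments), the lost segment is resent with cwnd pinned to a
 * single maxseg, and cwnd is then re-inflated by one maxseg per
 * duplicate ACK.  A sketch of the congestion math only; the actual
 * retransmission is elided.
 */
static void
fast_retransmit_cwnd(unsigned long snd_wnd, unsigned long *snd_cwnd,
    unsigned long *snd_ssthresh, unsigned int maxseg, int dupacks)
{
	unsigned long flight = snd_wnd < *snd_cwnd ? snd_wnd : *snd_cwnd;
	unsigned long win = flight / 2 / maxseg;	/* in segments */

	if (win < 2)
		win = 2;
	*snd_ssthresh = win * maxseg;
	/* ... retransmit one segment with cwnd == maxseg, then: */
	*snd_cwnd = *snd_ssthresh + (unsigned long)maxseg * dupacks;
}

/*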
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } } process_ACK: acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; /* * If t_dupacks != 0 here, it indicates that we are still * in NewReno fast recovery mode, so we leave the congestion * window alone. */ if (!tcp_do_newreno || tp->t_dupacks == 0) tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } sowwakeup(so); tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever.
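 */

/*
 * The congestion-window growth rule implemented above: exponential
 * growth (one maxseg per ACK) while under ssthresh, roughly linear
 * (maxseg per round trip, i.e. maxseg^2/cwnd per ACK) once past it,
 * capped by the peer's scaled maximum window.
 */
static unsigned long
open_cwnd(unsigned long cwnd, unsigned long ssthresh, unsigned int maxseg,
    unsigned long cwnd_cap)
{
	unsigned long incr = maxseg;

	if (cwnd > ssthresh)
		incr = (unsigned long)maxseg * maxseg / cwnd;
	cwnd += incr;
	return (cwnd < cwnd_cap ? cwnd : cwnd_cap);
}

/*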
*/ if (so->so_state & SS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= SS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. 
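 */

/*
 * The window-update acceptance rule from step6 above, as a predicate:
 * take the advertised window only from a segment that is newer by
 * sequence, newer by ACK, or identical on both but advertising more.
 * This ordering keeps stale, reordered segments from shrinking snd_wnd.
 */
#include <stdint.h>

static int
window_update_ok(uint32_t seq, uint32_t ack, uint32_t tiwin,
    uint32_t snd_wl1, uint32_t snd_wl2, uint32_t snd_wnd)
{
	if ((int32_t)(snd_wl1 - seq) < 0)
		return (1);		/* strictly newer sequence */
	if (snd_wl1 != seq)
		return (0);		/* older segment */
	if ((int32_t)(snd_wl2 - ack) < 0)
		return (1);		/* same seq, newer ack */
	return (snd_wl2 == ack && tiwin > snd_wnd);	/* pure window growth */
}

/*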
*/ if (th->th_urp <= (u_long)tlen #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, th, m, drop_hdrlen); /* hdr drop is delayed */ } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); if (so->so_state & SS_CANTRCVMORE) m_freem(m); else sbappend(&so->so_rcv, m); sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { callout_reset(tp->tt_2msl, tp->t_rxtcur * TCPTV_TWTRUNC, tcp_timer_2msl, tp); /* For transaction client, force ACK now. 
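 */

/*
 * The fast/slow split for inbound data described above, reduced to its
 * test: a segment bypasses the reassembly queue only when it carries
 * exactly the next expected byte, nothing is already queued, and the
 * handshake is complete.  Out-of-order data instead forces an immediate
 * ACK so the peer's fast retransmit can trigger.
 */
#include <stdint.h>

static int
can_append_directly(uint32_t seq, uint32_t rcv_nxt, int reass_queue_empty,
    int have_established)
{
	return (seq == rcv_nxt && reass_queue_empty && have_established);
}

/*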
*/ tp->t_flags |= TF_ACKNOW; } else callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); soisdisconnected(so); break; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); INP_UNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); return; dropwithreset: /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp) INP_UNLOCK(inp); m_freem(m); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Parse TCP options and place in tcpopt. 
*/ static void tcp_dooptions(to, cp, cnt, is_syn) struct tcpopt *to; u_char *cp; int cnt; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!is_syn) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (! is_syn) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; to->to_flags |= TOF_CC; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCNEW: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCNEW; bcopy((char *)cp + 2, (char *)&to->to_cc, sizeof(to->to_cc)); to->to_cc = ntohl(to->to_cc); break; case TCPOPT_CCECHO: if (optlen != TCPOLEN_CC) continue; if (!is_syn) continue; to->to_flags |= TOF_CCECHO; bcopy((char *)cp + 2, (char *)&to->to_ccecho, sizeof(to->to_ccecho)); to->to_ccecho = ntohl(to->to_ccecho); break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be droped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. 
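 */

/*
 * The same smoothing without the fixed-point tricks: the classic
 * Jacobson/Karels update with alpha = 1/8 for srtt, beta = 1/4 for the
 * mean deviation, and a retransmit timeout of srtt + 4 * rttvar.  A
 * readable model of the shift arithmetic above, not a replacement
 * for it.
 */
static double srtt_sk, rttvar_sk;	/* smoothed rtt and deviation */

static double
rtt_update(double rtt)
{
	if (srtt_sk == 0.0) {			/* first measurement */
		srtt_sk = rtt;
		rttvar_sk = rtt / 2;		/* first rexmt near 3*rtt */
	} else {
		double delta = rtt - srtt_sk;

		srtt_sk += delta / 8;		/* srtt = 7/8 srtt + 1/8 rtt */
		if (delta < 0)
			delta = -delta;
		rttvar_sk += (delta - rttvar_sk) / 4;
	}
	return (srtt_sk + 4 * rttvar_sk);	/* retransmit timeout */
}

/*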
*/ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * NOTE that this routine is only called when we process an incoming * segment, for outgoing segments only tcp_mssopt is called. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { register struct rtentry *rt; struct ifnet *ifp; register int rtt, mss; u_long bufsize; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct rmxp_tao *taop; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else rt = tcp_rtlookup(&inp->inp_inc); if (rt == NULL) { tp->t_maxopd = tp->t_maxseg = isipv6 ? 
tcp_v6mssdflt : tcp_mssdflt; return; } ifp = rt->rt_ifp; so = inp->inp_socket; taop = rmx_taop(rt->rt_rmx); /* * Offer == -1 means that we didn't receive SYN yet, * use cached value in that case; */ if (offer == -1) offer = taop->tao_mssopt; /* * Offer == 0 means that there was no MSS on the SYN segment, * in this case we use tcp_mssdflt. */ if (offer == 0) offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; else /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even is the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); taop->tao_mssopt = offer; /* * While we're here, check if there's an initial rtt * or rttvar. Convert from the route-table units * to scaled multiples of the slow timeout timer. */ if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { /* * XXX the lock bit for RTT indicates that the value * is also a minimum value; this is subject to time. */ if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { tp->t_rttvar = rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } /* * if there's an mtu associated with the route, use it * else, use the link mtu. */ if (rt->rt_rmx.rmx_mtu) mss = rt->rt_rmx.rmx_mtu - min_protoh; else { if (isipv6) { mss = nd_ifinfo[rt->rt_ifp->if_index].linkmtu - min_protoh; if (!in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else { mss = ifp->if_mtu - min_protoh; if (!in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * In case of T/TCP, origoffer==-1 indicates, that no segments * were received yet. In this case we just guess, otherwise * we do the same as before T/TCP. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif /* * If there's a pipesize, change the socket buffer * to that size. Make the socket buffers an integral * number of mss units; if the mss is larger than * the socket buffer, decrease the mss. 
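 */

/*
 * The core of the MSS computation above, minus the routing and T/TCP
 * plumbing: start from the path (or link) MTU less the fixed headers,
 * honor the peer's offer, and round down to a multiple of the mbuf
 * cluster size so clusters are filled exactly.  A sketch under those
 * simplifications only.
 */
static int
compute_mss(int mtu, int min_protoh, int offer, int clbytes)
{
	int mss = mtu - min_protoh;

	if (offer > 0 && offer < mss)
		mss = offer;		/* min(mss, offer) */
	if (mss > clbytes)
		mss -= mss % clbytes;	/* round down to cluster multiple */
	return (mss);
}

/*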
*/ #ifdef RTV_SPIPE if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) #endif bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve(&so->so_snd, bufsize, so, NULL); } tp->t_maxseg = mss; #ifdef RTV_RPIPE if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) #endif bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve(&so->so_rcv, bufsize, so, NULL); } /* * Set the slow-start flight size depending on whether this * is a local network or not. */ if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; if (rt->rt_rmx.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(tp) struct tcpcb *tp; { struct rtentry *rt; #ifdef INET6 int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const int isipv6 = 0; const size_t min_protoh = sizeof (struct tcpiphdr); #endif if (isipv6) rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); else rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); if (rt == NULL) return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt); return (rt->rt_ifp->if_mtu - min_protoh); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 105193) +++ head/sys/netinet/tcp_subr.c (revision 105194) @@ -1,1695 +1,1690 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif /*IPSEC*/ #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif #if 0 static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); #endif int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); int tcp_do_rfc1644 = 0; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. 
Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ static int tcp_inflight_enable = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_min = 1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * This is the actual shape of what we allocate using the zone * allocator. Doing it this way allows us to protect both structures * using the same generation count, and also eliminates the overhead * of allocating tcpcbs separately. By hiding the structure here, * we avoid changing most of the rest of the code (although it needs * to be changed, eventually, for greater efficiency). */ #define ALIGNMENT 32 #define ALIGNM1 (ALIGNMENT - 1) struct inp_tp { union { struct inpcb inp; char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; } inp_tp_u; struct tcpcb tcb; struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; struct callout inp_tp_delack; }; #undef ALIGNMENT #undef ALIGNM1 /* * Tcp initialization */ void tcp_init() { int hashsize = TCBHASHSIZE; tcp_ccgen = 1; tcp_cleartaocache(); tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_slop = TCPTV_CPU_VAR; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR syncache_init(); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
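
The tcbhashsize fetched by tcp_init() above comes from the kernel environment, so it can be tuned at boot; a loader.conf line such as net.inet.tcp.tcbhashsize="1024" (the name is taken from the TUNABLE_INT_FETCH call above) selects a larger table. The power-of-two guard can be sketched in plain C as follows; this is only an illustrative userland rendering, not the kernel's powerof2() macro:

#include <stdio.h>

static int powerof2(unsigned int x)
{
	return (x != 0 && (x & (x - 1)) == 0);
}

int main(void)
{
	unsigned int hashsize = 1024;	/* hypothetical tunable value */

	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512;		/* the same safe default the kernel uses */
	}
	printf("tcbhashsize = %u\n", hashsize);
	return (0);
}
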
*/ void tcp_fillheaders(tp, ip_ptr, tcp_ptr) struct tcpcb *tp; void *ip_ptr; void *tcp_ptr; { struct inpcb *inp = tp->t_inpcb; struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; tcp_hdr->th_sum = 0; } else #endif { struct ip *ip = (struct ip *) ip_ptr; ip->ip_vhl = IP_VHL_BORING; ip->ip_tos = 0; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); } tcp_hdr->th_sport = inp->inp_lport; tcp_hdr->th_dport = inp->inp_fport; tcp_hdr->th_seq = 0; tcp_hdr->th_ack = 0; tcp_hdr->th_x2 = 0; tcp_hdr->th_off = 5; tcp_hdr->th_flags = 0; tcp_hdr->th_win = 0; tcp_hdr->th_urp = 0; } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcp_maketemplate(tp) struct tcpcb *tp; { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
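
tcp_fillheaders() above seeds th_sum with the partial checksum of the IPv4 pseudo-header, so that in_cksum() or checksum-offloading hardware only has to add in the TCP header and payload. A rough userland analogue of that seeding step, assuming the usual ones'-complement fold (a sketch, not the kernel's in_pseudo()):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Sum the pseudo-header fields and fold the carries, ones'-complement style. */
static uint16_t pseudo_seed(uint32_t src, uint32_t dst, uint16_t len_proto)
{
	uint32_t sum;

	sum = (src >> 16) + (src & 0xffff) +
	    (dst >> 16) + (dst & 0xffff) + len_proto;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}

int main(void)
{
	uint32_t src = inet_addr("192.0.2.1");	/* example addresses */
	uint32_t dst = inet_addr("192.0.2.2");

	/* Mirrors htons(sizeof(struct tcphdr) + IPPROTO_TCP) above: 20 + 6. */
	printf("th_sum seed = 0x%04x\n", pseudo_seed(src, dst, htons(20 + 6)));
	return (0);
}
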
*/ void tcp_respond(tp, ipgen, th, m, ack, seq, flags) struct tcpcb *tp; void *ipgen; register struct tcphdr *th; register struct mbuf *m; tcp_seq ack, seq; int flags; { register int tlen; int win = 0; struct route *ro = 0; struct route sro; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct route_in6 *ro6 = 0; struct route_in6 sro6; struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp) { if (!(flags & TH_RST)) { win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } #ifdef INET6 if (isipv6) ro6 = &tp->t_inpcb->in6p_route; else #endif /* INET6 */ ro = &tp->t_inpcb->inp_route; } else { #ifdef INET6 if (isipv6) { ro6 = &sro6; bzero(ro6, sizeof *ro6); } else #endif /* INET6 */ { ro = &sro; bzero(ro, sizeof *ro); } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = 0; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; #ifdef MAC if (tp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m); } else { /* * XXXMAC: This will need to call a mac function that * modifies the mbuf label in place for TCP datagrams * not associated with a PCB. */ } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, ro6 && ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif -#ifdef IPSEC - if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { - m_freem(m); - return; - } -#endif #ifdef INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, + tp ? tp->t_inpcb : NULL); if (ro6 == &sro6 && ro6->ro_rt) { RTFREE(ro6->ro_rt); ro6->ro_rt = NULL; } } else #endif /* INET6 */ { - (void) ip_output(m, NULL, ro, ipflags, NULL); + (void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(inp) struct inpcb *inp; { struct inp_tp *it; register struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ it = (struct inp_tp *)inp; tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); callout_init(tp->tt_persist = &it->inp_tp_persist, 0); callout_init(tp->tt_keep = &it->inp_tp_keep, 0); callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); callout_init(tp->tt_delack = &it->inp_tp_delack, 0); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. 
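
The srtt/rttvar initialization in tcp_newtcpcb() above is chosen so that, before any RTT sample arrives, srtt + 4 * rttvar works out to exactly TCPTV_RTOBASE. A worked check, assuming the classic BSD scale factors (t_srtt stored << 5, t_rttvar stored << 4) and a 3-second TCPTV_RTOBASE; the constants here are assumptions for illustration:

#include <stdio.h>

#define TCP_RTT_SHIFT		5	/* assumed: t_srtt scaled by 32 */
#define TCP_RTTVAR_SHIFT	4	/* assumed: t_rttvar scaled by 16 */

int main(void)
{
	int hz = 100;
	int srttbase = 0;		/* TCPTV_SRTTBASE */
	int rtobase = 3 * hz;		/* TCPTV_RTOBASE: 3 seconds in ticks */
	int t_srtt = srttbase;
	int t_rttvar = ((rtobase - srttbase) << TCP_RTTVAR_SHIFT) / 4;

	/* Unscale and combine: srtt + 4 * rttvar, in ticks. */
	int rto = (t_srtt >> TCP_RTT_SHIFT) +
	    4 * (t_rttvar >> TCP_RTTVAR_SHIFT);

	printf("initial RTO = %d ticks (%d ms)\n", rto, rto * 1000 / hz);
	return (0);
}
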
*/ struct tcpcb * tcp_drop(tp, errno) register struct tcpcb *tp; int errno; { struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } /* * Close a TCP control block: * discard all space held by the tcp * discard internet protocol block * wake up any sleepers */ struct tcpcb * tcp_close(tp) register struct tcpcb *tp; { register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ register struct rtentry *rt; int dosavessthresh; /* * Make sure that all of our timers are stopped before we * delete the PCB. */ callout_stop(tp->tt_rexmt); callout_stop(tp->tt_persist); callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as the 16 samples. * 16 samples is enough for the srtt filter to converge * to within 5% of the correct value; fewer samples and * we could save a very bogus rtt. * * Don't update the default route's characteristics and don't * update anything that the user "locked". */ if (tp->t_rttupdated >= 16) { register u_long i = 0; #ifdef INET6 if (isipv6) { struct sockaddr_in6 *sin6; if ((rt = inp->in6p_route.ro_rt) == NULL) goto no_valid_rt; sin6 = (struct sockaddr_in6 *)rt_key(rt); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) goto no_valid_rt; } else #endif /* INET6 */ if ((rt = inp->inp_route.ro_rt) == NULL || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY) goto no_valid_rt; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); if (rt->rt_rmx.rmx_rtt && i) /* * filter this update to half the old & half * the new values, converting scale. * See route.h and tcp_var.h for a * description of the scaling constants. */ rt->rt_rmx.rmx_rtt = (rt->rt_rmx.rmx_rtt + i) / 2; else rt->rt_rmx.rmx_rtt = i; tcpstat.tcps_cachedrtt++; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); if (rt->rt_rmx.rmx_rttvar && i) rt->rt_rmx.rmx_rttvar = (rt->rt_rmx.rmx_rttvar + i) / 2; else rt->rt_rmx.rmx_rttvar = i; tcpstat.tcps_cachedrttvar++; } /* * The old comment here said: * update the pipelimit (ssthresh) if it has been updated * already or if a pipesize was specified & the threshhold * got below half the pipesize. I.e., wait for bad news * before we start updating, then update on both good * and bad news. * * But we want to save the ssthresh even if no pipesize is * specified explicitly in the route, because such * connections still have an implicit pipesize specified * by the global tcp_sendspace. In the absence of a reliable * way to calculate the pipesize, it will have to do. */ i = tp->snd_ssthresh; if (rt->rt_rmx.rmx_sendpipe != 0) dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); else dosavessthresh = (i < so->so_snd.sb_hiwat / 2); if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && i != 0 && rt->rt_rmx.rmx_ssthresh != 0) || dosavessthresh) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; i *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; else rt->rt_rmx.rmx_ssthresh = i; tcpstat.tcps_cachedssthresh++; } } no_valid_rt: /* free the reassembly queue, if any */ while((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); } inp->inp_ppcb = NULL; soisdisconnected(so); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else #endif /* INET6 */ in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); } void tcp_drain() { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb))) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); FREE(te, M_TSEGQ); } } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(inp, error) struct inpcb *inp; int error; { struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return inp; } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tcp_drop(tp, error); return (struct inpcb *)0; } else { tp->t_softerror = error; return inp; } #if 0 wakeup((caddr_t) &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
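
tcp_pcblist() here is the backend of the net.inet.tcp.pcblist sysctl that netstat-style tools consume: one call with a NULL buffer obtains the padded size estimate (the req->oldptr == 0 path above), a second call fills the buffer, and the caller compares the leading and trailing xinpgen generation counts to detect concurrent changes (see the xig_gen re-read at the end of the function below). A hedged userland sketch of that calling pattern; the exact header set may vary and error handling is kept minimal:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct xinpgen *head, *tail;
	size_t len = 0;
	char *buf;

	/* First pass: size only, mirroring the oldptr == 0 branch above. */
	if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1)
		return (1);
	if ((buf = malloc(len)) == NULL)
		return (1);
	/* Second pass: xinpgen header, xtcpcb records, trailing xinpgen. */
	if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == -1)
		return (1);
	head = (struct xinpgen *)buf;
	tail = (struct xinpgen *)(buf + len - sizeof(*tail));
	if (head->xig_gen != tail->xig_gen)
		printf("list changed underneath us; retry\n");
	else
		printf("%u PCBs reported\n", (unsigned)head->xig_count);
	free(buf);
	return (0);
}
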
*/ s = splnet(); INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; caddr_t inp_ppcb; xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb != NULL) bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); else bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xt.xt_socket); error = SYSCTL_OUT(req, &xt, sizeof xt); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, s, mapped = 0; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } s = splnet(); INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr 
*)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; tcp_seq icmp_seq; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (PRC_IS_REDIRECT(cmd)) { ip = 0; notify = in_rtchange; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); th = (struct tcphdr *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (inp->inp_socket != NULL) { icmp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_seq, tp->snd_una) && SEQ_LT(icmp_seq, tp->snd_max)) inp = (*notify)(inp, inetctlerrmap[cmd]); } if (inp) INP_UNLOCK(inp); } else { struct in_conninfo inc; inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); splx(s); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(cmd, sa, d) int cmd; struct sockaddr *sa; void *d; { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. 
*/ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcb, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; syncache_unreach(&inc, &th); } else in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used to generate sequence numbers. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * */ #define ISN_BYTES_PER_SECOND 1048576 u_char isn_secret[32]; int isn_last_reseed; MD5_CTX isn_ctx; tcp_seq tcp_new_isn(tp) struct tcpcb *tp; { u_int32_t md5_buffer[4]; tcp_seq new_isn; /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); return new_isn; } /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. */ struct inpcb * tcp_quench(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp) tp->snd_cwnd = tp->t_maxseg; return (inp); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
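
The RFC 1948 arrangement described above is straightforward to model outside the kernel: hash the connection 4-tuple together with a random secret, then add a clock-driven component so that ISNs advance at ISN_BYTES_PER_SECOND. A userland sketch using libmd's MD5 (link with -lmd on FreeBSD); the function and its arguments are illustrative, not the kernel's tcp_new_isn():

#include <md5.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t secret[32];	/* the kernel fills this from read_random() */

static uint32_t
rfc1948_isn(uint16_t fport, uint16_t lport, uint32_t faddr, uint32_t laddr,
    uint32_t ticks, uint32_t hz)
{
	MD5_CTX ctx;
	uint8_t digest[16];
	uint32_t hash;

	MD5Init(&ctx);
	MD5Update(&ctx, (const unsigned char *)&fport, sizeof(fport));
	MD5Update(&ctx, (const unsigned char *)&lport, sizeof(lport));
	MD5Update(&ctx, (const unsigned char *)&faddr, sizeof(faddr));
	MD5Update(&ctx, (const unsigned char *)&laddr, sizeof(laddr));
	MD5Update(&ctx, secret, sizeof(secret));
	MD5Final(digest, &ctx);
	memcpy(&hash, digest, sizeof(hash));

	/* Monotonic part: 1 MB/s, i.e. ISN_BYTES_PER_SECOND / hz per tick. */
	return (hash + ticks * (1048576 / hz));
}

int main(void)
{
	printf("isn = %u\n",
	    rfc1948_isn(80, 54321, 0xc0000201, 0xc0000202, 1000, 100));
	return (0);
}
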
*/ struct inpcb * tcp_drop_syn_sent(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp && tp->t_state == TCPS_SYN_SENT) { tcp_drop(tp, errno); return (struct inpcb *)0; } return inp; } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; struct rmxp_tao *taop; struct socket *so = inp->inp_socket; int offered; int mss; #ifdef INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ if (tp) { #ifdef INET6 if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else #endif /* INET6 */ rt = tcp_rtlookup(&inp->inp_inc); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return inp; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; mss = rt->rt_rmx.rmx_mtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; if (offered) mss = min(mss, offered); /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return inp; tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_output(tp); } return inp; } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated the return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. 
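
tcp_mtudisc() above boils down to simple arithmetic: the new MSS is the path MTU minus the fixed headers, clamped by what the peer offered, reduced by per-segment option overhead, and rounded down to a cluster boundary. A worked example for a 1500-byte MTU; the constants mirror typical kernel values but are assumptions here:

#include <stdio.h>

#define MCLBYTES		2048	/* assumed mbuf cluster size */
#define TCPOLEN_TSTAMP_APPA	12	/* timestamp option plus padding */

int main(void)
{
	int mtu = 1500;			/* rt_rmx.rmx_mtu from the ICMP update */
	int min_protoh = 20 + 20;	/* IPv4 header + TCP header */
	int offered = 1460;		/* peer's MSS option (taop->tao_mssopt) */
	int mss = mtu - min_protoh;	/* 1460 */

	if (offered && offered < mss)
		mss = offered;
	mss -= TCPOLEN_TSTAMP_APPA;	/* both sides negotiated timestamps */
	if (mss > MCLBYTES)		/* round down to a cluster multiple */
		mss &= ~(MCLBYTES - 1);
	printf("new t_maxseg = %d\n", mss);	/* 1448 */
	return (0);
}
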
*/ struct rtentry * tcp_rtlookup(inc) struct in_conninfo *inc; { struct route *ro; struct rtentry *rt; ro = &inc->inc_route; rt = ro->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (inc->inc_faddr.s_addr != INADDR_ANY) { ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = inc->inc_faddr; rtalloc(ro); rt = ro->ro_rt; } } return rt; } #ifdef INET6 struct rtentry * tcp_rtlookup6(inc) struct in_conninfo *inc; { struct route_in6 *ro6; struct rtentry *rt; ro6 = &inc->inc6_route; rt = ro6->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { ro6->ro_dst.sin6_family = AF_INET6; ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6->ro_dst.sin6_addr = inc->inc6_faddr; rtalloc((struct route *)ro6); rt = ro6->ro_rt; } } return rt; } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(tp) struct tcpcb *tp; { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif /* INET6 */ struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return 0; MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcp_fillheaders(tp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcp_fillheaders(tp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return hdrsiz; } #endif /*IPSEC*/ /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of * the route metrics. */ struct rmxp_tao * tcp_gettaocache(inc) struct in_conninfo *inc; { struct rtentry *rt; #ifdef INET6 if (inc->inc_isipv6) rt = tcp_rtlookup6(inc); else #endif /* INET6 */ rt = tcp_rtlookup(inc); /* Make sure this is a host route and is up. */ if (rt == NULL || (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) return NULL; return rmx_taop(rt->rt_rmx); } /* * Clear all the TAO cache entries, called from tcp_init. * * XXX * This routine is just an empty one, because we assume that the * routing tables are initialized at the same time as TCP, so there is * nothing in the cache left over. */ static void tcp_cleartaocache() { } /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar to (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only affects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches.
A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typically 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if 'ticks' rolls over, 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case.
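 *
 * As a concrete illustration (the numbers are hypothetical): at hz = 100,
 * with the bandwidth average converged to 292000 bytes/sec and a 100 ms
 * RTT (t_srtt = t_rttbest = 10 << TCP_RTT_SHIFT = 320), USERTT is 320 and
 *
 *	bwnd = 292000 * 320 / (100 << TCP_RTT_SHIFT) + 2 * 1460
 *	     = 29200 + 2920 = 32120 bytes,
 *
 * i.e. the bandwidth-delay product plus the two-segment slop.
 *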
Without the * slop we could be locking ourselves into a lower bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 105193) +++ head/sys/netinet/tcp_syncache.c (revision 105194) @@ -1,1383 +1,1377 @@ /*- * Copyright (c) 2001 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include static int tcp_syncookies = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, &tcp_syncookies, 0, "Use TCP SYN cookies if the syncache overflows"); static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); static int syncache_respond(struct syncache *, struct mbuf *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timer(void *); static u_int32_t syncookie_generate(struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct tcphdr *, struct socket *); /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 struct tcp_syncache { struct syncache_head *hashbase; uma_zone_t zone; u_int hashsize; u_int hashmask; u_int bucket_limit; u_int cache_count; u_int cache_limit; u_int rexmt_limit; u_int hash_secret; u_int next_reseed; TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1]; struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1]; }; static struct tcp_syncache tcp_syncache; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RD, &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RD, &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, &tcp_syncache.cache_count, 0, "Current number of entries in syncache"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RD, &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SYNCACHE_TIMEOUT(sc, slot) do { \ sc->sc_rxtslot = slot; \ sc->sc_rxttime = ticks + TCPTV_RTOBASE * 
tcp_backoff[slot]; \ TAILQ_INSERT_TAIL(&tcp_syncache.timerq[slot], sc, sc_timerq); \ if (!callout_active(&tcp_syncache.tt_timerq[slot])) \ callout_reset(&tcp_syncache.tt_timerq[slot], \ TCPTV_RTOBASE * tcp_backoff[slot], \ syncache_timer, (void *)((intptr_t)slot)); \ } while (0) static void syncache_free(struct syncache *sc) { struct rtentry *rt; if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); #ifdef INET6 if (sc->sc_inc.inc_isipv6) rt = sc->sc_route6.ro_rt; else #endif rt = sc->sc_route.ro_rt; if (rt != NULL) { /* * If this is the only reference to a protocol cloned * route, remove it immediately. */ if (rt->rt_flags & RTF_WASCLONED && (sc->sc_flags & SCF_KEEPROUTE) == 0 && rt->rt_refcnt == 1) rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL); RTFREE(rt); } uma_zfree(tcp_syncache.zone, sc); } void syncache_init(void) { int i; tcp_syncache.cache_count = 0; tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; tcp_syncache.cache_limit = tcp_syncache.hashsize * tcp_syncache.bucket_limit; tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; tcp_syncache.next_reseed = 0; tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &tcp_syncache.cache_limit); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &tcp_syncache.bucket_limit); if (!powerof2(tcp_syncache.hashsize)) { printf("WARNING: syncache hash size is not a power of 2.\n"); tcp_syncache.hashsize = 512; /* safe default */ } tcp_syncache.hashmask = tcp_syncache.hashsize - 1; /* Allocate the hash table. */ MALLOC(tcp_syncache.hashbase, struct syncache_head *, tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK); /* Initialize the hash buckets. */ for (i = 0; i < tcp_syncache.hashsize; i++) { TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket); tcp_syncache.hashbase[i].sch_length = 0; } /* Initialize the timer queues. */ for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) { TAILQ_INIT(&tcp_syncache.timerq[i]); callout_init(&tcp_syncache.tt_timerq[i], 0); } /* * Allocate the syncache entries. Allow the zone to allocate one * more entry than cache limit, so a new entry can bump out an * older one. */ tcp_syncache.cache_limit -= 1; tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit); } static void syncache_insert(sc, sch) struct syncache *sc; struct syncache_head *sch; { struct syncache *sc2; int s, i; /* * Make sure that we don't overflow the per-bucket * limit or the total cache size limit. */ s = splnet(); if (sch->sch_length >= tcp_syncache.bucket_limit) { /* * The bucket is full, toss the oldest element. */ sc2 = TAILQ_FIRST(&sch->sch_bucket); sc2->sc_tp->ts_recent = ticks; syncache_drop(sc2, sch); tcpstat.tcps_sc_bucketoverflow++; } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) { /* * The cache is full. Toss the oldest entry in the * entire cache. This is the front entry in the * first non-empty timer queue with the largest * timeout value. */ for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]); if (sc2 != NULL) break; } sc2->sc_tp->ts_recent = ticks; syncache_drop(sc2, NULL); tcpstat.tcps_sc_cacheoverflow++; } /* Initialize the entry's timer. */ SYNCACHE_TIMEOUT(sc, 0); /* Put it into the bucket. 
*/ TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; tcp_syncache.cache_count++; tcpstat.tcps_sc_added++; splx(s); } static void syncache_drop(sc, sch) struct syncache *sc; struct syncache_head *sch; { int s; if (sch == NULL) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)]; } else #endif { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)]; } } s = splnet(); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; tcp_syncache.cache_count--; TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq); if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot])) callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]); splx(s); syncache_free(sc); } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. */ static void syncache_timer(xslot) void *xslot; { intptr_t slot = (intptr_t)xslot; struct syncache *sc, *nsc; struct inpcb *inp; int s; s = splnet(); if (callout_pending(&tcp_syncache.tt_timerq[slot]) || !callout_active(&tcp_syncache.tt_timerq[slot])) { splx(s); return; } callout_deactivate(&tcp_syncache.tt_timerq[slot]); nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]); INP_INFO_RLOCK(&tcbinfo); while (nsc != NULL) { if (ticks < nsc->sc_rxttime) break; sc = nsc; inp = sc->sc_tp->t_inpcb; INP_LOCK(inp); if (slot == SYNCACHE_MAXREXMTS || slot >= tcp_syncache.rexmt_limit || inp->inp_gencnt != sc->sc_inp_gencnt) { nsc = TAILQ_NEXT(sc, sc_timerq); syncache_drop(sc, NULL); tcpstat.tcps_sc_stale++; INP_UNLOCK(inp); continue; } /* * syncache_respond() may call back into the syncache to * to modify another entry, so do not obtain the next * entry on the timer chain until it has completed. */ (void) syncache_respond(sc, NULL); INP_UNLOCK(inp); nsc = TAILQ_NEXT(sc, sc_timerq); tcpstat.tcps_sc_retransmitted++; TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq); SYNCACHE_TIMEOUT(sc, slot + 1); } INP_INFO_RUNLOCK(&tcbinfo); if (nsc != NULL) callout_reset(&tcp_syncache.tt_timerq[slot], nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot)); splx(s); } /* * Find an entry in the syncache. */ struct syncache * syncache_lookup(inc, schp) struct in_conninfo *inc; struct syncache_head **schp; { struct syncache *sc; struct syncache_head *sch; int s; #ifdef INET6 if (inc->inc_isipv6) { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, tcp_syncache.hashmask)]; *schp = sch; s = splnet(); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) { splx(s); return (sc); } } splx(s); } else #endif { sch = &tcp_syncache.hashbase[ SYNCACHE_HASH(inc, tcp_syncache.hashmask)]; *schp = sch; s = splnet(); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) { splx(s); return (sc); } } splx(s); } return (NULL); } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(inc, th) struct in_conninfo *inc; struct tcphdr *th; { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); if (sc == NULL) return; /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. 
* RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); tcpstat.tcps_sc_reset++; } } void syncache_badack(inc) struct in_conninfo *inc; { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); if (sc != NULL) { syncache_drop(sc, sch); tcpstat.tcps_sc_badack++; } } void syncache_unreach(inc, th) struct in_conninfo *inc; struct tcphdr *th; { struct syncache *sc; struct syncache_head *sch; /* we are called at splnet() here */ sc = syncache_lookup(inc, &sch); if (sc == NULL) return; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) return; /* * If we've retransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) { sc->sc_flags |= SCF_UNREACH; return; } syncache_drop(sc, sch); tcpstat.tcps_sc_unreach++; } /* * Build a new TCP socket structure from a syncache entry. */ static struct socket * syncache_socket(sc, lso, m) struct syncache *sc; struct socket *lso; struct mbuf *m; { struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, SS_ISCONNECTED); if (so == NULL) { /* * Drop the connection; we will send a RST if the peer * retransmits the ACK. */ tcpstat.tcps_listendrop++; goto abort; } #ifdef MAC mac_set_socket_peer_from_mbuf(m, so); #endif inp = sotoinpcb(so); /* * Insert new socket into hash list. */ inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif inp->inp_lport = sc->sc_inc.inc_lport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_isipv6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto abort; } #ifdef IPSEC /* copy old policy into new socket's */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_expand: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 *sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different from the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call.
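
Returning to the RST validation in syncache_chkrst() above: the RFC 793 window test relies on the kernel's modular sequence-space comparisons, which keep working across the 2^32 wrap. A standalone sketch of the predicate, with SEQ_GEQ/SEQ_LEQ written out the way tcp_seq.h defines them:

#include <stdint.h>
#include <stdio.h>

#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

/* Valid iff the RST's sequence number falls within [irs, irs + wnd]. */
static int rst_is_valid(uint32_t seq, uint32_t irs, uint32_t wnd)
{
	return (SEQ_GEQ(seq, irs) && SEQ_LEQ(seq, irs + wnd));
}

int main(void)
{
	/* An IRS near the 2^32 wrap still compares correctly. */
	printf("%d\n", rst_is_valid(0x00000010, 0xffffff00, 0x1000));	/* 1 */
	printf("%d\n", rst_is_valid(0x80000000, 0xffffff00, 0x1000));	/* 0 */
	return (0);
}
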
*/ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); inp->in6p_route = sc->sc_route6; sc->sc_route6.ro_rt = NULL; MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_NOWAIT | M_ZERO); if (sin6 == NULL) goto abort; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = sc->sc_inc.inc6_faddr; sin6->sin6_port = sc->sc_inc.inc_fport; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if (in6_pcbconnect(inp, (struct sockaddr *)sin6, &thread0)) { inp->in6p_laddr = laddr6; FREE(sin6, M_SONAME); goto abort; } FREE(sin6, M_SONAME); } else #endif { struct in_addr laddr; struct sockaddr_in *sin; inp->inp_options = ip_srcroute(); if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } inp->inp_route = sc->sc_route; sc->sc_route.ro_rt = NULL; MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT | M_ZERO); if (sin == NULL) goto abort; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = sc->sc_inc.inc_faddr; sin->sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if (in_pcbconnect(inp, (struct sockaddr *)sin, &thread0)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); goto abort; } FREE(sin, M_SONAME); } tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->requested_s_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_request_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsrecent; tp->ts_recent_age = ticks; } if (sc->sc_flags & SCF_CC) { /* * Initialization of the tcpcb for transaction; * set SND.WND = SEG.WND, * initialize CCsend and CCrecv. */ tp->t_flags |= TF_REQ_CC|TF_RCVD_CC; tp->cc_send = sc->sc_cc_send; tp->cc_recv = sc->sc_cc_recv; } tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. */ if (sc->sc_rxtslot != 0) tp->snd_cwnd = tp->t_maxseg; callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); tcpstat.tcps_accepts++; return (so); abort: if (so != NULL) (void) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(inc, th, sop, m) struct in_conninfo *inc; struct tcphdr *th; struct socket **sop; struct mbuf *m; { struct syncache *sc; struct syncache_head *sch; struct socket *so; sc = syncache_lookup(inc, &sch); if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. 
If it is, then * cobble up a fake syncache entry, and return. */ if (!tcp_syncookies) return (0); sc = syncookie_lookup(inc, th, *sop); if (sc == NULL) return (0); sch = NULL; tcpstat.tcps_sc_recvcookie++; } /* * If seg contains an ACK, but not for our SYN/ACK, send a RST. */ if (th->th_ack != sc->sc_iss + 1) return (0); so = syncache_socket(sc, *sop, m); if (so == NULL) { #if 0 resetandabort: /* XXXjlemon check this - is this correct? */ (void) tcp_respond(NULL, m, m, th, th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); #endif m_freem(m); /* XXX only needed for above */ tcpstat.tcps_sc_aborted++; } else { sc->sc_flags |= SCF_KEEPROUTE; tcpstat.tcps_sc_completed++; } if (sch == NULL) syncache_free(sc); else syncache_drop(sc, sch); *sop = so; return (1); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ int syncache_add(inc, to, th, sop, m) struct in_conninfo *inc; struct tcpopt *to; struct tcphdr *th; struct socket **sop; struct mbuf *m; { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; struct rmxp_tao *taop; int i, s, win; so = *sop; tp = sototcpcb(so); /* * Remember the IP options, if any. */ #ifdef INET6 if (!inc->inc_isipv6) #endif ipopts = ip_srcroute(); /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX * should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) */ sc = syncache_lookup(inc, &sch); if (sc != NULL) { tcpstat.tcps_sc_dupsyn++; if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if (sc->sc_flags & SCF_TIMESTAMP) sc->sc_tsrecent = to->to_tsval; /* * PCB may have changed, pick up new values. */ sc->sc_tp = tp; sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; if (syncache_respond(sc, m) == 0) { s = splnet(); TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq); SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot); splx(s); tcpstat.tcps_sndacks++; tcpstat.tcps_sndtotal++; } *sop = NULL; return (1); } sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ s = splnet(); for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { sc = TAILQ_FIRST(&tcp_syncache.timerq[i]); if (sc != NULL) break; } sc->sc_tp->ts_recent = ticks; syncache_drop(sc, NULL); splx(s); tcpstat.tcps_sc_zonefail++; sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); if (sc == NULL) { if (ipopts) (void) m_free(ipopts); return (0); } } /* * Fill in the syncache values.
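 *
 * The zone-exhaustion path above picks its victim by scanning the
 * retransmit-slot timer queues from the highest slot down, so the entry
 * that has been retransmitted (and therefore waited) the longest goes
 * first. A user-space sketch of that scan, using the same <sys/queue.h>
 * TAILQ idiom; the entry type and slot count here are illustrative.
 */

#include <stdio.h>
#include <sys/queue.h>

#define MAXREXMTS 3

struct entry {
	int id;
	TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(slotq, entry);

static struct slotq timerq[MAXREXMTS + 1];

/* The first entry on the highest occupied slot is the oldest. */
static struct entry *
find_victim(void)
{
	struct entry *e = NULL;
	int i;

	for (i = MAXREXMTS; i >= 0; i--) {
		e = TAILQ_FIRST(&timerq[i]);
		if (e != NULL)
			break;
	}
	return (e);
}

int
main(void)
{
	struct entry a = { 1 }, b = { 2 };
	int i;

	for (i = 0; i <= MAXREXMTS; i++)
		TAILQ_INIT(&timerq[i]);
	TAILQ_INSERT_TAIL(&timerq[0], &a, link);
	TAILQ_INSERT_TAIL(&timerq[2], &b, link);
	printf("victim: %d\n", find_victim()->id);	/* 2 */
	return (0);
}

/*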
*/ bzero(sc, sizeof(*sc)); sc->sc_tp = tp; sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; sc->sc_ipopts = ipopts; sc->sc_inc.inc_fport = inc->inc_fport; sc->sc_inc.inc_lport = inc->inc_lport; #ifdef INET6 sc->sc_inc.inc_isipv6 = inc->inc_isipv6; if (inc->inc_isipv6) { sc->sc_inc.inc6_faddr = inc->inc6_faddr; sc->sc_inc.inc6_laddr = inc->inc6_laddr; sc->sc_route6.ro_rt = NULL; } else #endif { sc->sc_inc.inc_faddr = inc->inc_faddr; sc->sc_inc.inc_laddr = inc->inc_laddr; sc->sc_route.ro_rt = NULL; } sc->sc_irs = th->th_seq; if (tcp_syncookies) sc->sc_iss = syncookie_generate(sc); else sc->sc_iss = arc4random(); /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */ win = sbspace(&so->so_rcv); win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; sc->sc_flags = 0; sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0; if (tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsrecent = to->to_tsval; sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* Compute proper scaling value from buffer space */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat) wscale++; sc->sc_request_r_scale = wscale; sc->sc_requested_s_scale = to->to_requested_s_scale; sc->sc_flags |= SCF_WINSCALE; } } if (tcp_do_rfc1644) { /* * A CC or CC.new option received in a SYN makes * it ok to send CC in subsequent segments. */ if (to->to_flags & (TOF_CC|TOF_CCNEW)) { sc->sc_cc_recv = to->to_cc; sc->sc_cc_send = CC_INC(tcp_ccgen); sc->sc_flags |= SCF_CC; } } if (tp->t_flags & TF_NOOPT) sc->sc_flags = SCF_NOOPT; /* * XXX * We have the option here of not doing TAO (even if the segment * qualifies) and instead fall back to a normal 3WHS via the syncache. * This allows us to apply synflood protection to TAO-qualifying SYNs * also. However, there should be a heuristic to determine when to * do this, but none is present at the moment. */ /* * Perform TAO test on incoming CC (SEG.CC) option, if any. * - compare SEG.CC against cached CC from the same host, if any. * - if SEG.CC > cached value, SYN must be new and is accepted * immediately: save new CC in the cache, mark the socket * connected, enter ESTABLISHED state, turn on flag to * send a SYN in the next segment. * A virtual advertised window is set in rcv_adv to * initialize SWS prevention. Then enter normal segment * processing: drop SYN, process data and FIN. * - otherwise do a normal 3-way handshake. */ taop = tcp_gettaocache(&sc->sc_inc); if ((to->to_flags & TOF_CC) != 0) { if (((tp->t_flags & TF_NOPUSH) != 0) && sc->sc_flags & SCF_CC && taop != NULL && taop->tao_cc != 0 && CC_GT(to->to_cc, taop->tao_cc)) { sc->sc_rxtslot = 0; so = syncache_socket(sc, *sop, m); if (so != NULL) { sc->sc_flags |= SCF_KEEPROUTE; taop->tao_cc = to->to_cc; *sop = so; } syncache_free(sc); return (so != NULL); } } else { /* * No CC option, but maybe CC.NEW: invalidate cached value. */ if (taop != NULL) taop->tao_cc = 0; } /* * TAO test failed or there was no CC option, * do a standard 3-way handshake.
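 *
 * The SCF_WINSCALE branch above derives the advertised window-scale
 * factor from the receive buffer size: the smallest shift that lets
 * TCP_MAXWIN cover sb_hiwat. The loop in isolation, with the kernel's
 * constant values:
 */

#include <stdio.h>

#define TCP_MAXWIN		65535
#define TCP_MAX_WINSHIFT	14

static int
compute_wscale(unsigned long sb_hiwat)
{
	int wscale = 0;

	while (wscale < TCP_MAX_WINSHIFT &&
	    ((unsigned long)TCP_MAXWIN << wscale) < sb_hiwat)
		wscale++;
	return (wscale);
}

int
main(void)
{
	printf("%d\n", compute_wscale(57344));		/* 0: fits unscaled */
	printf("%d\n", compute_wscale(1048576));	/* 5 */
	return (0);
}

/*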
*/ if (syncache_respond(sc, m) == 0) { syncache_insert(sc, sch); tcpstat.tcps_sndacks++; tcpstat.tcps_sndtotal++; } else { syncache_free(sc); tcpstat.tcps_sc_dropped++; } *sop = NULL; return (1); } static int syncache_respond(sc, m) struct syncache *sc; struct mbuf *m; { u_int8_t *optp; int optlen, error; u_int16_t tlen, hlen, mssopt; struct ip *ip = NULL; struct rtentry *rt; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { rt = tcp_rtlookup6(&sc->sc_inc); if (rt != NULL) mssopt = rt->rt_ifp->if_mtu - (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); else mssopt = tcp_v6mssdflt; hlen = sizeof(struct ip6_hdr); } else #endif { rt = tcp_rtlookup(&sc->sc_inc); if (rt != NULL) mssopt = rt->rt_ifp->if_mtu - (sizeof(struct ip) + sizeof(struct tcphdr)); else mssopt = tcp_mssdflt; hlen = sizeof(struct ip); } /* Compute the size of the TCP options. */ if (sc->sc_flags & SCF_NOOPT) { optlen = 0; } else { optlen = TCPOLEN_MAXSEG + ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) + ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) + ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0); } tlen = hlen + sizeof(struct tcphdr) + optlen; /* * XXX * assume that the entire packet will fit in a header mbuf */ KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small")); /* * XXX shouldn't this reuse the mbuf if possible ? * Create the IP+TCP header from scratch. */ if (m) m_freem(m); m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_create_mbuf_from_socket(sc->sc_tp->t_inpcb->inp_socket, m); #endif -#ifdef IPSEC - /* use IPsec policy on listening socket to send SYN,ACK */ - if (ipsec_setsocket(m, sc->sc_tp->t_inpcb->inp_socket) != 0) { - m_freem(m); - return (ENOBUFS); - } -#endif - #ifdef INET6 if (sc->sc_inc.inc_isipv6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ /* ip6_flow = ??? */ th = (struct tcphdr *)(ip6 + 1); } else #endif { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = tlen; ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos = sc->sc_tp->t_inpcb->inp_ip_tos; /* XXX */ /* * See if we should do MTU discovery. Route lookups are expensive, * so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) { ip->ip_off |= IP_DF; } th = (struct tcphdr *)(ip + 1); } th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; /* Tack on the TCP options. 
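 *
 * The option block appended below begins with the MSS option: a kind
 * byte, a length byte, and the 16-bit value in network order. A
 * standalone sketch of that encoding (put_mss() is a hypothetical
 * helper):
 */

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_MAXSEG	2
#define TCPOLEN_MAXSEG	4

static int
put_mss(uint8_t *optp, uint16_t mss)
{
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = TCPOLEN_MAXSEG;
	*optp++ = (mss >> 8) & 0xff;	/* high byte first: network order */
	*optp++ = mss & 0xff;
	return (TCPOLEN_MAXSEG);
}

int
main(void)
{
	uint8_t buf[TCPOLEN_MAXSEG];

	put_mss(buf, 1460);
	printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
	/* prints: 02 04 05 b4 */
	return (0);
}

/*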
*/ if (optlen == 0) goto no_options; optp = (u_int8_t *)(th + 1); *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; *optp++ = (mssopt >> 8) & 0xff; *optp++ = mssopt & 0xff; if (sc->sc_flags & SCF_WINSCALE) { *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | sc->sc_request_r_scale); optp += 4; } if (sc->sc_flags & SCF_TIMESTAMP) { u_int32_t *lp = (u_int32_t *)(optp); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(sc->sc_tsrecent); optp += TCPOLEN_TSTAMP_APPA; } /* * Send CC and CC.echo if we received CC from our peer. */ if (sc->sc_flags & SCF_CC) { u_int32_t *lp = (u_int32_t *)(optp); *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); *lp++ = htonl(sc->sc_cc_send); *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CCECHO)); *lp = htonl(sc->sc_cc_recv); optp += TCPOLEN_CC_APPA * 2; } no_options: #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct route_in6 *ro6 = &sc->sc_route6; th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); ip6->ip6_hlim = in6_selecthlim(NULL, ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL); - error = ip6_output(m, NULL, ro6, 0, NULL, NULL); + error = ip6_output(m, NULL, ro6, 0, NULL, NULL, + sc->sc_tp->t_inpcb); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen - hlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); - error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL); + error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL, + sc->sc_tp->t_inpcb); } return (error); } /* * cookie layers: * * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .| * | peer iss | * | MD5(laddr,faddr,lport,fport,secret) |. . . . . . .| * | 0 |(A)| | * (A): peer mss index */ /* * The values below are chosen to minimize the size of the tcp_secret * table, as well as providing roughly a 4 second lifetime for the cookie. */ #define SYNCOOKIE_HASHSHIFT 2 /* log2(# of 32bit words from hash) */ #define SYNCOOKIE_WNDBITS 7 /* exposed bits for window indexing */ #define SYNCOOKIE_TIMESHIFT 5 /* scale ticks to window time units */ #define SYNCOOKIE_HASHMASK ((1 << SYNCOOKIE_HASHSHIFT) - 1) #define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1) #define SYNCOOKIE_NSECRETS (1 << (SYNCOOKIE_WNDBITS - SYNCOOKIE_HASHSHIFT)) #define SYNCOOKIE_TIMEOUT \ (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT)) #define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK) static struct { u_int32_t ts_secbits; u_int ts_expire; } tcp_secret[SYNCOOKIE_NSECRETS]; static int tcp_msstab[] = { 0, 536, 1460, 8960 }; static MD5_CTX syn_ctx; #define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v)) /* * Consider the problem of a recreated (and retransmitted) cookie. If the * original SYN was accepted, the connection is established. The second * SYN is inflight, and if it arrives with an ISN that falls within the * receive window, the connection is killed. * * However, since cookies have other problems, this may not be worth * worrying about. 
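 *
 * A sketch of the window/secret indexing syncookie_generate() performs
 * below: ticks are scaled so the 7-bit window value wraps in roughly
 * four seconds, and the window selects one of the 32 rotating secrets.
 * The hz and ticks values here are purely illustrative.
 */

#include <stdio.h>

#define SYNCOOKIE_HASHSHIFT	2
#define SYNCOOKIE_WNDBITS	7
#define SYNCOOKIE_TIMESHIFT	5
#define SYNCOOKIE_WNDMASK	((1 << SYNCOOKIE_WNDBITS) - 1)

int
main(void)
{
	int hz = 100, ticks = 12345;	/* illustrative stand-ins */
	int wnd, idx;

	/* (ticks << 5) / hz advances 32 units per second; the low 7 bits
	 * give a window that repeats every 128/32 = 4 seconds. */
	wnd = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
	idx = wnd >> SYNCOOKIE_HASHSHIFT;	/* secret index, 0..31 */
	printf("wnd=%d idx=%d\n", wnd, idx);	/* wnd=110 idx=27 */
	return (0);
}

/*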
*/ static u_int32_t syncookie_generate(struct syncache *sc) { u_int32_t md5_buffer[4]; u_int32_t data; int wnd, idx; wnd = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK; idx = wnd >> SYNCOOKIE_HASHSHIFT; if (tcp_secret[idx].ts_expire < ticks) { tcp_secret[idx].ts_secbits = arc4random(); tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT; } for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--) if (tcp_msstab[data] <= sc->sc_peer_mss) break; data = (data << SYNCOOKIE_WNDBITS) | wnd; data ^= sc->sc_irs; /* peer's iss */ MD5Init(&syn_ctx); #ifdef INET6 if (sc->sc_inc.inc_isipv6) { MD5Add(sc->sc_inc.inc6_laddr); MD5Add(sc->sc_inc.inc6_faddr); } else #endif { MD5Add(sc->sc_inc.inc_laddr); MD5Add(sc->sc_inc.inc_faddr); } MD5Add(sc->sc_inc.inc_lport); MD5Add(sc->sc_inc.inc_fport); MD5Add(tcp_secret[idx].ts_secbits); MD5Final((u_char *)&md5_buffer, &syn_ctx); data ^= (md5_buffer[wnd & SYNCOOKIE_HASHMASK] & ~SYNCOOKIE_WNDMASK); return (data); } static struct syncache * syncookie_lookup(inc, th, so) struct in_conninfo *inc; struct tcphdr *th; struct socket *so; { u_int32_t md5_buffer[4]; struct syncache *sc; u_int32_t data; int wnd, idx; data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */ wnd = data & SYNCOOKIE_WNDMASK; idx = wnd >> SYNCOOKIE_HASHSHIFT; if (tcp_secret[idx].ts_expire < ticks || sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks) return (NULL); MD5Init(&syn_ctx); #ifdef INET6 if (inc->inc_isipv6) { MD5Add(inc->inc6_laddr); MD5Add(inc->inc6_faddr); } else #endif { MD5Add(inc->inc_laddr); MD5Add(inc->inc_faddr); } MD5Add(inc->inc_lport); MD5Add(inc->inc_fport); MD5Add(tcp_secret[idx].ts_secbits); MD5Final((u_char *)&md5_buffer, &syn_ctx); data ^= md5_buffer[wnd & SYNCOOKIE_HASHMASK]; if ((data & ~SYNCOOKIE_DATAMASK) != 0) return (NULL); data = data >> SYNCOOKIE_WNDBITS; sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); if (sc == NULL) return (NULL); /* * Fill in the syncache values. * XXX duplicate code from syncache_add */ sc->sc_ipopts = NULL; sc->sc_inc.inc_fport = inc->inc_fport; sc->sc_inc.inc_lport = inc->inc_lport; #ifdef INET6 sc->sc_inc.inc_isipv6 = inc->inc_isipv6; if (inc->inc_isipv6) { sc->sc_inc.inc6_faddr = inc->inc6_faddr; sc->sc_inc.inc6_laddr = inc->inc6_laddr; sc->sc_route6.ro_rt = NULL; } else #endif { sc->sc_inc.inc_faddr = inc->inc_faddr; sc->sc_inc.inc_laddr = inc->inc_laddr; sc->sc_route.ro_rt = NULL; } sc->sc_irs = th->th_seq - 1; sc->sc_iss = th->th_ack - 1; wnd = sbspace(&so->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; sc->sc_flags = 0; sc->sc_rxtslot = 0; sc->sc_peer_mss = tcp_msstab[data]; return (sc); } Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 105193) +++ head/sys/netinet/tcp_timewait.c (revision 105194) @@ -1,1695 +1,1690 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif /*IPSEC*/ #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif #if 0 static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); #endif int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); int tcp_do_rfc1644 = 0; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. 
Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ static int tcp_inflight_enable = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_min = 1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * This is the actual shape of what we allocate using the zone * allocator. Doing it this way allows us to protect both structures * using the same generation count, and also eliminates the overhead * of allocating tcpcbs separately. By hiding the structure here, * we avoid changing most of the rest of the code (although it needs * to be changed, eventually, for greater efficiency). */ #define ALIGNMENT 32 #define ALIGNM1 (ALIGNMENT - 1) struct inp_tp { union { struct inpcb inp; char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; } inp_tp_u; struct tcpcb tcb; struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; struct callout inp_tp_delack; }; #undef ALIGNMENT #undef ALIGNM1 /* * Tcp initialization */ void tcp_init() { int hashsize = TCBHASHSIZE; tcp_ccgen = 1; tcp_cleartaocache(); tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_slop = TCPTV_CPU_VAR; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR syncache_init(); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. 
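 *
 * A side note on struct inp_tp earlier in this file: the union with a
 * char array rounds the inpcb up to a 32-byte multiple so the tcpcb
 * that follows stays aligned. The round-up idiom in isolation:
 */

#include <stdio.h>

#define ALIGNMENT	32
#define ALIGNM1		(ALIGNMENT - 1)
#define ROUND_UP(x)	(((x) + ALIGNM1) & ~ALIGNM1)

int
main(void)
{
	printf("%d -> %d\n", 180, ROUND_UP(180));	/* 180 -> 192 */
	printf("%d -> %d\n", 192, ROUND_UP(192));	/* 192 -> 192 */
	return (0);
}

/*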
*/ void tcp_fillheaders(tp, ip_ptr, tcp_ptr) struct tcpcb *tp; void *ip_ptr; void *tcp_ptr; { struct inpcb *inp = tp->t_inpcb; struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; tcp_hdr->th_sum = 0; } else #endif { struct ip *ip = (struct ip *) ip_ptr; ip->ip_vhl = IP_VHL_BORING; ip->ip_tos = 0; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); } tcp_hdr->th_sport = inp->inp_lport; tcp_hdr->th_dport = inp->inp_fport; tcp_hdr->th_seq = 0; tcp_hdr->th_ack = 0; tcp_hdr->th_x2 = 0; tcp_hdr->th_off = 5; tcp_hdr->th_flags = 0; tcp_hdr->th_win = 0; tcp_hdr->th_urp = 0; } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcp_maketemplate(tp) struct tcpcb *tp; { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
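 *
 * tcp_respond() below reflects a received header straight back at its
 * sender by swapping the address and port pairs in place; the xchg()
 * macro it defines for that is the classic temporary-variable swap:
 */

#include <stdio.h>
#include <stdint.h>

#define xchg(a, b, type) { type t; t = (a); (a) = (b); (b) = t; }

int
main(void)
{
	uint16_t sport = 12345, dport = 80;

	xchg(sport, dport, uint16_t);
	printf("%u %u\n", sport, dport);	/* 80 12345 */
	return (0);
}

/*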
*/ void tcp_respond(tp, ipgen, th, m, ack, seq, flags) struct tcpcb *tp; void *ipgen; register struct tcphdr *th; register struct mbuf *m; tcp_seq ack, seq; int flags; { register int tlen; int win = 0; struct route *ro = 0; struct route sro; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct route_in6 *ro6 = 0; struct route_in6 sro6; struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp) { if (!(flags & TH_RST)) { win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } #ifdef INET6 if (isipv6) ro6 = &tp->t_inpcb->in6p_route; else #endif /* INET6 */ ro = &tp->t_inpcb->inp_route; } else { #ifdef INET6 if (isipv6) { ro6 = &sro6; bzero(ro6, sizeof *ro6); } else #endif /* INET6 */ { ro = &sro; bzero(ro, sizeof *ro); } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = 0; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; #ifdef MAC if (tp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m); } else { /* * XXXMAC: This will need to call a mac function that * modifies the mbuf label in place for TCP datagrams * not associated with a PCB. */ } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, ro6 && ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif -#ifdef IPSEC - if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { - m_freem(m); - return; - } -#endif #ifdef INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, + tp ? tp->t_inpcb : NULL); if (ro6 == &sro6 && ro6->ro_rt) { RTFREE(ro6->ro_rt); ro6->ro_rt = NULL; } } else #endif /* INET6 */ { - (void) ip_output(m, NULL, ro, ipflags, NULL); + (void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(inp) struct inpcb *inp; { struct inp_tp *it; register struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ it = (struct inp_tp *)inp; tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); callout_init(tp->tt_persist = &it->inp_tp_persist, 0); callout_init(tp->tt_keep = &it->inp_tp_keep, 0); callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); callout_init(tp->tt_delack = &it->inp_tp_delack, 0); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. 
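 *
 * The hunks above show the point of this change: the per-mbuf
 * ipsec_setsocket() tagging is removed and the inpcb is handed to
 * ip_output()/ip6_output() instead, so policy can be looked up from
 * the PCB at output time (NULL when no connection owns the packet,
 * as for some RSTs). A minimal sketch of the pattern; every type and
 * name below is hypothetical.
 */

#include <stdio.h>
#include <stddef.h>

struct pcb { int policy; };	/* stands in for struct inpcb */
struct pkt { int len; };	/* stands in for an mbuf chain */

/* Policy is resolved from the caller-supplied PCB at output time,
 * not from a tag attached to the packet beforehand. */
static int
output(struct pkt *p, const struct pcb *inp)
{
	int policy = (inp != NULL) ? inp->policy : 0;	/* NULL: default */

	printf("send %d bytes, policy %d\n", p->len, policy);
	return (0);
}

int
main(void)
{
	struct pcb tcb = { 1 };
	struct pkt m = { 40 };

	output(&m, &tcb);	/* segment owned by a connection */
	output(&m, NULL);	/* e.g. a RST with no backing PCB */
	return (0);
}

/*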
*/ struct tcpcb * tcp_drop(tp, errno) register struct tcpcb *tp; int errno; { struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } /* * Close a TCP control block: * discard all space held by the tcp * discard internet protocol block * wake up any sleepers */ struct tcpcb * tcp_close(tp) register struct tcpcb *tp; { register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ register struct rtentry *rt; int dosavessthresh; /* * Make sure that all of our timers are stopped before we * delete the PCB. */ callout_stop(tp->tt_rexmt); callout_stop(tp->tt_persist); callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as the 16 samples. * 16 samples is enough for the srtt filter to converge * to within 5% of the correct value; fewer samples and * we could save a very bogus rtt. * * Don't update the default route's characteristics and don't * update anything that the user "locked". */ if (tp->t_rttupdated >= 16) { register u_long i = 0; #ifdef INET6 if (isipv6) { struct sockaddr_in6 *sin6; if ((rt = inp->in6p_route.ro_rt) == NULL) goto no_valid_rt; sin6 = (struct sockaddr_in6 *)rt_key(rt); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) goto no_valid_rt; } else #endif /* INET6 */ if ((rt = inp->inp_route.ro_rt) == NULL || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY) goto no_valid_rt; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); if (rt->rt_rmx.rmx_rtt && i) /* * filter this update to half the old & half * the new values, converting scale. * See route.h and tcp_var.h for a * description of the scaling constants. */ rt->rt_rmx.rmx_rtt = (rt->rt_rmx.rmx_rtt + i) / 2; else rt->rt_rmx.rmx_rtt = i; tcpstat.tcps_cachedrtt++; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); if (rt->rt_rmx.rmx_rttvar && i) rt->rt_rmx.rmx_rttvar = (rt->rt_rmx.rmx_rttvar + i) / 2; else rt->rt_rmx.rmx_rttvar = i; tcpstat.tcps_cachedrttvar++; } /* * The old comment here said: * update the pipelimit (ssthresh) if it has been updated * already or if a pipesize was specified & the threshhold * got below half the pipesize. I.e., wait for bad news * before we start updating, then update on both good * and bad news. * * But we want to save the ssthresh even if no pipesize is * specified explicitly in the route, because such * connections still have an implicit pipesize specified * by the global tcp_sendspace. In the absence of a reliable * way to calculate the pipesize, it will have to do. */ i = tp->snd_ssthresh; if (rt->rt_rmx.rmx_sendpipe != 0) dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); else dosavessthresh = (i < so->so_snd.sb_hiwat / 2); if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && i != 0 && rt->rt_rmx.rmx_ssthresh != 0) || dosavessthresh) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; i *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; else rt->rt_rmx.rmx_ssthresh = i; tcpstat.tcps_cachedssthresh++; } } no_valid_rt: /* free the reassembly queue, if any */ while((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); } inp->inp_ppcb = NULL; soisdisconnected(so); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else #endif /* INET6 */ in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); } void tcp_drain() { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb))) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); FREE(te, M_TSEGQ); } } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(inp, error) struct inpcb *inp; int error; { struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return inp; } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tcp_drop(tp, error); return (struct inpcb *)0; } else { tp->t_softerror = error; return inp; } #if 0 wakeup((caddr_t) &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something.
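 *
 * One more note on tcp_close() above: each measurement is folded into
 * the cached route metric as half old value, half new. The filter step
 * in isolation, ignoring the RTM_RTTUNIT scale conversion:
 */

#include <stdio.h>

/* Half-old/half-new smoothing: converges quickly while damping
 * one-off outliers. */
static unsigned long
smooth(unsigned long cached, unsigned long sample)
{
	return (cached ? (cached + sample) / 2 : sample);
}

int
main(void)
{
	unsigned long samples[] = { 100, 120, 80, 400 };
	unsigned long rtt = 0;
	int i;

	for (i = 0; i < 4; i++) {
		rtt = smooth(rtt, samples[i]);
		printf("sample %lu -> cached %lu\n", samples[i], rtt);
	}
	/* cached values: 100, 110, 95, 247 */
	return (0);
}

/*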
*/ s = splnet(); INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; caddr_t inp_ppcb; xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb != NULL) bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); else bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xt.xt_socket); error = SYSCTL_OUT(req, &xt, sizeof xt); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, s, mapped = 0; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } s = splnet(); INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr 
*)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; tcp_seq icmp_seq; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (PRC_IS_REDIRECT(cmd)) { ip = 0; notify = in_rtchange; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); th = (struct tcphdr *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (inp->inp_socket != NULL) { icmp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_seq, tp->snd_una) && SEQ_LT(icmp_seq, tp->snd_max)) inp = (*notify)(inp, inetctlerrmap[cmd]); } if (inp) INP_UNLOCK(inp); } else { struct in_conninfo inc; inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); splx(s); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(cmd, sa, d) int cmd; struct sockaddr *sa; void *d; { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_QUENCH) notify = tcp_quench; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. 
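 *
 * tcp6_getcred() above recovers the embedded IPv4 address of a
 * v4-mapped IPv6 address by reading the last four of its sixteen
 * bytes; the same extraction in user space:
 */

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in6_addr a6;
	struct in_addr a4;

	/* ::ffff:192.0.2.1 is the v4-mapped form of 192.0.2.1. */
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &a6);
	if (IN6_IS_ADDR_V4MAPPED(&a6)) {
		memcpy(&a4, &a6.s6_addr[12], sizeof(a4));
		printf("%s\n", inet_ntoa(a4));	/* 192.0.2.1 */
	}
	return (0);
}

/*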
*/ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcb, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; syncache_unreach(&inc, &th); } else in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used to generate sequence numbers. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * */ #define ISN_BYTES_PER_SECOND 1048576 u_char isn_secret[32]; int isn_last_reseed; MD5_CTX isn_ctx; tcp_seq tcp_new_isn(tp) struct tcpcb *tp; { u_int32_t md5_buffer[4]; tcp_seq new_isn; /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); return new_isn; } /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. */ struct inpcb * tcp_quench(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp) tp->snd_cwnd = tp->t_maxseg; return (inp); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
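 *
 * tcp_new_isn() above follows RFC 1948: ISN = H(4-tuple, secret) plus
 * a clock that advances one megabyte of sequence space per second. A
 * toy sketch of the shape of the computation; toyhash() is a stand-in
 * for MD5 and every value here is illustrative.
 */

#include <stdio.h>
#include <stdint.h>

#define ISN_BYTES_PER_SECOND 1048576

static uint32_t
toyhash(uint32_t x)	/* NOT cryptographic; stands in for MD5 */
{
	x ^= x >> 16;
	x *= 0x45d9f3bU;
	x ^= x >> 16;
	return (x);
}

static uint32_t
new_isn(uint32_t laddr, uint16_t lport, uint32_t faddr, uint16_t fport,
    uint32_t secret, uint32_t seconds)
{
	uint32_t h;

	h = toyhash(laddr ^ faddr ^ ((uint32_t)lport << 16 | fport) ^ secret);
	/* Monotonic offset keeps ISNs increasing for TIME_WAIT recycling. */
	return (h + seconds * ISN_BYTES_PER_SECOND);
}

int
main(void)
{
	printf("%u\n", new_isn(0xc0000201, 1234, 0xc0000202, 80,
	    0xdeadbeef, 10));
	return (0);
}

/*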
*/ struct inpcb * tcp_drop_syn_sent(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); if (tp && tp->t_state == TCPS_SYN_SENT) { tcp_drop(tp, errno); return (struct inpcb *)0; } return inp; } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(inp, errno) struct inpcb *inp; int errno; { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; struct rmxp_tao *taop; struct socket *so = inp->inp_socket; int offered; int mss; #ifdef INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ if (tp) { #ifdef INET6 if (isipv6) rt = tcp_rtlookup6(&inp->inp_inc); else #endif /* INET6 */ rt = tcp_rtlookup(&inp->inp_inc); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return inp; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; mss = rt->rt_rmx.rmx_mtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; if (offered) mss = min(mss, offered); /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return inp; tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_output(tp); } return inp; } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU.
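 *
 * tcp_mtudisc() above also rounds the discovered MSS down to an
 * mbuf-cluster multiple so full-sized segments pack cleanly into
 * clusters. The rounding step in isolation (2048 is a typical
 * MCLBYTES):
 */

#include <stdio.h>

#define MCLBYTES 2048

static int
round_mss(int mss)
{
#if (MCLBYTES & (MCLBYTES - 1)) == 0
	if (mss > MCLBYTES)
		mss &= ~(MCLBYTES - 1);		/* power-of-two fast path */
#else
	if (mss > MCLBYTES)
		mss = mss / MCLBYTES * MCLBYTES;
#endif
	return (mss);
}

int
main(void)
{
	printf("%d\n", round_mss(8960));	/* 8192 */
	printf("%d\n", round_mss(1460));	/* 1460: <= MCLBYTES, unchanged */
	return (0);
}

/*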
*/ struct rtentry * tcp_rtlookup(inc) struct in_conninfo *inc; { struct route *ro; struct rtentry *rt; ro = &inc->inc_route; rt = ro->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (inc->inc_faddr.s_addr != INADDR_ANY) { ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = inc->inc_faddr; rtalloc(ro); rt = ro->ro_rt; } } return rt; } #ifdef INET6 struct rtentry * tcp_rtlookup6(inc) struct in_conninfo *inc; { struct route_in6 *ro6; struct rtentry *rt; ro6 = &inc->inc6_route; rt = ro6->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { ro6->ro_dst.sin6_family = AF_INET6; ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6->ro_dst.sin6_addr = inc->inc6_faddr; rtalloc((struct route *)ro6); rt = ro6->ro_rt; } } return rt; } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(tp) struct tcpcb *tp; { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif /* INET6 */ struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return 0; MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcp_fillheaders(tp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcp_fillheaders(tp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return hdrsiz; } #endif /*IPSEC*/ /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of * the route metrics. */ struct rmxp_tao * tcp_gettaocache(inc) struct in_conninfo *inc; { struct rtentry *rt; #ifdef INET6 if (inc->inc_isipv6) rt = tcp_rtlookup6(inc); else #endif /* INET6 */ rt = tcp_rtlookup(inc); /* Make sure this is a host route and is up. */ if (rt == NULL || (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) return NULL; return rmx_taop(rt->rt_rmx); } /* * Clear all the TAO cache entries, called from tcp_init. * * XXX * This routine is just an empty one, because we assume that the * routing tables are initialized at the same time as TCP, so there is * nothing in the cache left over. */ static void tcp_cleartaocache() { } /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (but not meant to be identical) to TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only affects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches.
A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typically 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rollover 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case.
Without the * slop we could be locking ourselves into a lower bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 105193) +++ head/sys/netinet/udp_usrreq.c (revision 105194) @@ -1,1077 +1,1071 @@ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
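/*
 * A minimal stand-alone sketch of the bwnd arithmetic above, assuming
 * illustrative values for hz and the rtt/segment inputs; bdp_window()
 * is a hypothetical helper, not part of this file, and it collapses
 * USERTT down to a single shifted-srtt argument for brevity.
 */
#include <stdio.h>
#include <stdint.h>

#define HZ		100	/* assumed clock tick rate */
#define RTT_SHIFT	5	/* srtt kept << 5, as with TCP_RTT_SHIFT */

/* bwnd = bandwidth * rtt + 2 segments of slop, as computed above. */
static uint64_t
bdp_window(uint64_t bw, uint64_t srtt_shifted, uint64_t maxseg)
{
	return (bw * srtt_shifted / ((uint64_t)HZ << RTT_SHIFT) +
	    2 * maxseg);
}

int
main(void)
{
	/*
	 * 1 Mbyte/sec across a 100 ms RTT (10 ticks at hz=100): the
	 * product is 100000 bytes, plus 2 * 1460 bytes of slop.
	 */
	printf("bwnd = %llu bytes\n", (unsigned long long)
	    bdp_window(1000000, 10 << RTT_SHIFT, 1460));
	return (0);
}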
* * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 * $FreeBSD$ */ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #ifdef INET6 #include #endif #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #ifndef COMPAT_42 static int udpcksum = 1; #else static int udpcksum = 0; /* XXX */ #endif SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udpcksum, 0, ""); int log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send port unreachables for refused connects"); struct inpcbhead udb; /* from udp_var.h */ #define udb6 udb /* for KAME src sync over BSD*'s */ struct inpcbinfo udbinfo; #ifndef UDBHASHSIZE #define UDBHASHSIZE 16 #endif struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; #ifdef INET6 struct udp_in6 { struct sockaddr_in6 uin6_sin; u_char uin6_init_done : 1; } udp_in6 = { { sizeof(udp_in6.uin6_sin), AF_INET6 }, 0 }; struct udp_ip6 { struct ip6_hdr uip6_ip6; u_char uip6_init_done : 1; } udp_ip6; #endif /* INET6 */ static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off); #ifdef INET6 static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip); #endif static int udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); void udp_init() { INP_INFO_LOCK_INIT(&udbinfo, "udp"); LIST_INIT(&udb); udbinfo.listhead = &udb; udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.porthashmask); udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(udbinfo.ipi_zone, maxsockets); } void udp_input(m, off) register struct mbuf *m; int off; { int iphlen = off; register struct ip *ip; register struct udphdr *uh; register struct inpcb *inp; struct mbuf *opts = 0; int len; struct ip save_ip; struct sockaddr *append_sa; #ifdef MAC int error; #endif udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, * make available to user, and use on returned packets, * but we don't yet have a way to check the checksum * with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Make mbuf data length reflect UDP length. * If not enough data to reflect UDP length, drop. 
*/ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want to restore it * for sending an ICMP error message in response. */ if (!blackhole) save_ip = *ip; /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh->uh_sum = m->m_pkthdr.csum_data; else uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh->uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh->uh_sum) { udpstat.udps_badsum++; m_freem(m); return; } } else udpstat.udps_nosum++; INP_INFO_RLOCK(&udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { struct inpcb *last; /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match * those of the incoming datagram. This allows more than * one process to receive multi/broadcasts on the same port. * (This really ought to be done for unicast datagrams as * well, but that would cause problems with existing * applications that open both address-specific sockets and * a wildcard socket listening to the same port -- they would * end up receiving duplicates of every unicast datagram. * Those applications open the multiple sockets to overcome an * inadequacy of the UDP socket interface, but for backwards * compatibility we avoid the problem here rather than * fixing the interface. Maybe 4.5BSD will remedy this?) */ /* * Construct sockaddr format source address. */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Locate pcb(s) for datagram. * (Algorithm copied from raw_intr().) */ last = NULL; #ifdef INET6 udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0; #endif LIST_FOREACH(inp, &udb, inp_list) { INP_LOCK(inp); if (inp->inp_lport != uh->uh_dport) { docontinue: INP_UNLOCK(inp); continue; } #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) goto docontinue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY) { if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) goto docontinue; } if (inp->inp_faddr.s_addr != INADDR_ANY) { if (inp->inp_faddr.s_addr != ip->ip_src.s_addr || inp->inp_fport != uh->uh_sport) goto docontinue; } if (last != NULL) { struct mbuf *n; int policyfail; policyfail = 0; #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(m, last->inp_socket)) { ipsecstat.in_polvio++; policyfail = 1; /* do not inject data to pcb */ } #endif /*IPSEC*/ #ifdef MAC if (mac_check_socket_deliver(last->inp_socket, m) != 0) policyfail = 1; #endif if (!policyfail) { n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr)); } INP_UNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids searching * through all pcbs in the common case of a non-shared * port. It * assumes that an application will never * clear these options after setting them.
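/*
 * The SO_REUSEPORT heuristic just described is what shared-port
 * receivers rely on. A hedged user-space sketch, with error handling
 * trimmed; open_shared_udp() is illustrative, not a system interface.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

int
open_shared_udp(unsigned short port)
{
	struct sockaddr_in sin;
	int s, on = 1;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return (-1);
	/*
	 * Set before bind(); udp_input() only keeps scanning past a
	 * match when the matched pcb carries one of these options.
	 */
	setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(s);
		return (-1);
	}
	return (s);
}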
*/ if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. * (No need to send an ICMP Port Unreachable * for a broadcast or multicast datagram.) */ udpstat.udps_noportbcast++; goto badheadlocked; } #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject_so(m, last->inp_socket)) { ipsecstat.in_polvio++; goto badheadlocked; } #endif /*IPSEC*/ INP_UNLOCK(last); INP_INFO_RUNLOCK(&udbinfo); udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); return; } /* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { udpstat.udps_noportbcast++; goto badheadlocked; } if (blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&udbinfo); return; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); #ifdef IPSEC if (ipsec4_in_reject_so(m, inp->inp_socket)) { ipsecstat.in_polvio++; goto bad; } #endif /*IPSEC*/ #ifdef MAC error = mac_check_socket_deliver(inp->inp_socket, m); if (error) goto bad; #endif /* * Construct sockaddr format source address. * Stuff source address and datagram in user buffer. */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { int savedflags; ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); savedflags = inp->inp_flags; inp->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); inp->inp_flags = savedflags; } else #endif ip_savecontrol(inp, &opts, ip, m); } m_adj(m, iphlen + sizeof(struct udphdr)); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); append_sa = (struct sockaddr *)&udp_in6; } else #endif append_sa = (struct sockaddr *)&udp_in; if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) { udpstat.udps_fullsock++; goto bad; } sorwakeup(inp->inp_socket); INP_UNLOCK(inp); return; badheadlocked: INP_INFO_RUNLOCK(&udbinfo); bad: if (inp) INP_UNLOCK(inp); badunlocked: m_freem(m); if (opts) m_freem(opts); return; } #ifdef INET6 static void ip_2_ip6_hdr(ip6, ip) struct ip6_hdr *ip6; struct ip *ip; { bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = ip->ip_len; ip6->ip6_nxt = ip->ip_p; ip6->ip6_hlim = ip->ip_ttl; ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_SMP; ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; } #endif /* * subroutine of udp_input(), mainly for source code readability. * caller must properly init udp_ip6 and udp_in6 beforehand.
*/ static void udp_append(last, ip, n, off) struct inpcb *last; struct ip *ip; struct mbuf *n; int off; { struct sockaddr *append_sa; struct mbuf *opts = 0; if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) { #ifdef INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; if (udp_ip6.uip6_init_done == 0) { ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); udp_ip6.uip6_init_done = 1; } savedflags = last->inp_flags; last->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n); last->inp_flags = savedflags; } else #endif ip_savecontrol(last, &opts, ip, n); } #ifdef INET6 if (last->inp_vflag & INP_IPV6) { if (udp_in6.uin6_init_done == 0) { in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); udp_in6.uin6_init_done = 1; } append_sa = (struct sockaddr *)&udp_in6.uin6_sin; } else #endif append_sa = (struct sockaddr *)&udp_in; m_adj(n, off); if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); udpstat.udps_fullsock++; } else sorwakeup(last->inp_socket); } /* * Notify a udp user of an asynchronous error; * just wake up so that he can collect error status. */ struct inpcb * udp_notify(inp, errno) register struct inpcb *inp; int errno; { inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return inp; } void udp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct udphdr *uh; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct in_addr faddr; struct inpcb *inp; int s; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) { ip = 0; notify = in_rtchange; } else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { s = splnet(); uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if(inp->inp_socket != NULL) { (*notify)(inp, inetctlerrmap[cmd]); } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); splx(s); } else in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
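/*
 * udp_pcblist() implements the usual two-step sysctl protocol: the
 * probe with a NULL old pointer above yields a padded size estimate,
 * then a second call copies the xinpgen/xinpcb records out. A hedged
 * sketch of a user-space consumer; record parsing is omitted and
 * fetch_udp_pcblist() is illustrative.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdlib.h>

void *
fetch_udp_pcblist(size_t *lenp)
{
	void *buf;
	size_t len = 0;

	/* Step 1: ask how much space the list needs right now. */
	if (sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0) < 0)
		return (NULL);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	/* Step 2: fetch; may still fail if the list grew meanwhile. */
	if (sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0) < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}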
*/ s = splnet(); gencnt = udbinfo.ipi_gencnt; n = udbinfo.ipi_count; splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&udbinfo); for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&udbinfo); xig.xig_gen = udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error, s; error = suser_cred(req->td->td_ucred, PRISON_ROOT); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); s = splnet(); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp == NULL || inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_INFO_RUNLOCK(&udbinfo); splx(s); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(inp, m, addr, control, td) register struct inpcb *inp; struct mbuf *m; struct sockaddr *addr; struct mbuf *control; struct thread *td; { register struct udpiphdr *ui; register int len = m->m_pkthdr.len; struct in_addr laddr; struct sockaddr_in *sin; int s = 0, error = 0; #ifdef MAC mac_create_mbuf_from_socket(inp->inp_socket, m); #endif if (control) m_freem(control); /* XXX */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { error = EMSGSIZE; goto release; } if (addr) { sin = (struct sockaddr_in *)addr; if (td && jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); laddr = inp->inp_laddr; if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Must block input while temporarily connected. 
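/*
 * The temporary connect performed here is the kernel-side cost of a
 * sendto() on an unconnected UDP socket: in_pcbconnect() on entry,
 * in_pcbdisconnect() on the way out, every time. A hedged user-space
 * sketch of the two calling styles; both helpers are illustrative.
 */
#include <sys/types.h>
#include <sys/socket.h>

/* One-shot: each datagram pays for the connect/disconnect dance. */
ssize_t
udp_send_once(int s, const void *msg, size_t len,
    const struct sockaddr *dst, socklen_t dstlen)
{
	return (sendto(s, msg, len, 0, dst, dstlen));
}

/* Steady traffic: connect once, then take the cheaper send() path. */
ssize_t
udp_send_connected(int s, const void *msg, size_t len,
    const struct sockaddr *dst, socklen_t dstlen)
{
	if (connect(s, dst, dstlen) < 0)
		return (-1);
	return (send(s, msg, len, 0));
}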
*/ s = splnet(); error = in_pcbconnect(inp, addr, td); if (error) { splx(s); goto release; } } else { if (inp->inp_faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf * for UDP and IP headers. */ M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; if (addr) splx(s); goto release; } /* * Fill in mbuf with extended UDP header * and addresses and length put into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = IPPROTO_UDP; ui->ui_src = inp->inp_laddr; ui->ui_dst = inp->inp_faddr; ui->ui_sport = inp->inp_lport; ui->ui_dport = inp->inp_fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); /* * Set up checksum and output datagram. */ if (udpcksum) { ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr, htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else { ui->ui_sum = 0; } ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; -#ifdef IPSEC - if (ipsec_setsocket(m, inp->inp_socket) != 0) { - error = ENOBUFS; - goto release; - } -#endif /*IPSEC*/ error = ip_output(m, inp->inp_options, &inp->inp_route, (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)), - inp->inp_moptions); + inp->inp_moptions, inp); if (addr) { in_pcbdisconnect(inp); inp->inp_laddr = laddr; /* XXX rehash? */ splx(s); } return (error); release: m_freem(m); return (error); } u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum incoming UDP datagram size"); static int udp_abort(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; /* ??? possible? panic instead? 
*/ } INP_LOCK(inp); soisdisconnected(so); s = splnet(); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); return 0; } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int s, error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp != 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } error = soreserve(so, udp_sendspace, udp_recvspace); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } s = splnet(); error = in_pcballoc(so, &udbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; INP_UNLOCK(inp); return 0; } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s, error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); s = splnet(); error = in_pcbbind(inp, nam, td); splx(s); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int s, error; struct sockaddr_in *sin; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return EISCONN; } s = splnet(); sin = (struct sockaddr_in *)nam; if (td && jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td); splx(s); if (error == 0) soisconnected(so); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_detach(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); s = splnet(); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); return 0; } static int udp_disconnect(struct socket *so) { struct inpcb *inp; int s; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_INFO_WUNLOCK(&udbinfo); INP_UNLOCK(inp); return ENOTCONN; } s = splnet(); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); splx(s); so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; int ret; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); m_freem(m); return EINVAL; } INP_LOCK(inp); ret = udp_output(inp, m, addr, control, td); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return ret; } int udp_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking * here because in_setsockaddr will call malloc and might block. */ static int udp_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &udbinfo)); } /* * This is the wrapper function for in_setpeeraddr. 
We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int udp_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &udbinfo)); } struct pr_usrreqs udp_usrreqs = { udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, pru_listen_notsupp, udp_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, udp_sockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 105193) +++ head/sys/netinet6/icmp6.c (revision 105194) @@ -1,2869 +1,2861 @@ /* $FreeBSD$ */ /* $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif #include #ifdef HAVE_NRL_INPCB /* inpcb members */ #define in6pcb inpcb #define in6p_laddr inp_laddr6 #define in6p_faddr inp_faddr6 #define in6p_icmp6filt inp_icmp6filt #define in6p_route inp_route #define in6p_socket inp_socket #define in6p_flags inp_flags #define in6p_moptions inp_moptions6 #define in6p_outputopts inp_outputopts6 #define in6p_ip6 inp_ipv6 #define in6p_flowinfo inp_flowinfo #define in6p_sp inp_sp #define in6p_next inp_next #define in6p_prev inp_prev /* macro names */ #define sotoin6pcb sotoinpcb /* function names */ #define in6_pcbdetach in_pcbdetach #define in6_rtchange in_rtchange /* * for KAME src sync over BSD*'s. XXX: FreeBSD (>=3) are VERY different from * others... 
*/ #define in6p_ip6_nxt inp_ipv6.ip6_nxt #endif extern struct domain inet6domain; struct icmp6stat icmp6stat; extern struct inpcbhead ripcb; extern int icmp6errppslim; static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; static void icmp6_errcount __P((struct icmp6errstat *, int, int)); static int icmp6_rip6_input __P((struct mbuf **, int)); static int icmp6_ratelimit __P((const struct in6_addr *, const int, const int)); static const char *icmp6_redirect_diag __P((struct in6_addr *, struct in6_addr *, struct in6_addr *)); #ifndef HAVE_PPSRATECHECK static int ppsratecheck __P((struct timeval *, int *, int)); #endif static struct mbuf *ni6_input __P((struct mbuf *, int)); static struct mbuf *ni6_nametodns __P((const char *, int, int)); static int ni6_dnsmatch __P((const char *, int, const char *, int)); static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, char *)); static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int)); static int icmp6_notify_error __P((struct mbuf *, int, int, int)); #ifdef COMPAT_RFC1885 static struct route_in6 icmp6_reflect_rt; #endif void icmp6_init() { mld6_init(); } static void icmp6_errcount(stat, type, code) struct icmp6errstat *stat; int type, code; { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: stat->icp6errs_dst_unreach_noroute++; return; case ICMP6_DST_UNREACH_ADMIN: stat->icp6errs_dst_unreach_admin++; return; case ICMP6_DST_UNREACH_BEYONDSCOPE: stat->icp6errs_dst_unreach_beyondscope++; return; case ICMP6_DST_UNREACH_ADDR: stat->icp6errs_dst_unreach_addr++; return; case ICMP6_DST_UNREACH_NOPORT: stat->icp6errs_dst_unreach_noport++; return; } break; case ICMP6_PACKET_TOO_BIG: stat->icp6errs_packet_too_big++; return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: stat->icp6errs_time_exceed_transit++; return; case ICMP6_TIME_EXCEED_REASSEMBLY: stat->icp6errs_time_exceed_reassembly++; return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: stat->icp6errs_paramprob_header++; return; case ICMP6_PARAMPROB_NEXTHEADER: stat->icp6errs_paramprob_nextheader++; return; case ICMP6_PARAMPROB_OPTION: stat->icp6errs_paramprob_option++; return; } break; case ND_REDIRECT: stat->icp6errs_redirect++; return; } stat->icp6errs_unknown++; } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(m, type, code, param) struct mbuf *m; int type, code, param; { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; icmp6stat.icp6s_error++; /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { icmp6stat.icp6s_canterror++; goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * Multicast destination check. For unrecognized option errors, * this check has already done in ip6_unknown_opt(), so we can * check only for other errors. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* Source address check. XXX: the case of anycast source? 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ icmp6stat.icp6s_canterror++; goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { icmp6stat.icp6s_toofreq++; goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_DONTWAIT); if (m && m->m_len < preplen) m = m_pullup(m, preplen); if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src)) oip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst)) oip6->ip6_dst.s6_addr16[1] = 0; icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). */ m->m_pkthdr.rcvif = NULL; icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP.
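/*
 * The suppression rules applied above condense to a short RFC 2463
 * checklist. A hedged restatement; the structure and function are
 * illustrative, not kernel interfaces, and the type/code constants
 * are written out numerically in the comments.
 */
#include <stdbool.h>

struct icmp6_err_ctx {
	bool dst_mcast;		/* offending packet had mcast/bcast dst */
	bool src_unusable;	/* unspecified or multicast source */
	bool answers_error;	/* would respond to an ICMPv6 error */
	bool over_rate;		/* the rate limiter said no */
	int type, code;
};

static bool
may_send_icmp6_error(const struct icmp6_err_ctx *c)
{
	/*
	 * Multicast destinations may only be answered with Packet Too
	 * Big (type 2) or the unrecognized-option Parameter Problem
	 * (type 4, code 2).
	 */
	if (c->dst_mcast && c->type != 2 &&
	    !(c->type == 4 && c->code == 2))
		return (false);
	if (c->src_unusable)	/* nowhere sensible to send the error */
		return (false);
	if (c->answers_error)	/* never answer an error or redirect */
		return (false);
	return (!c->over_rate);
}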
So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src))); icmp6stat.icp6s_checksum++; goto freeit; } if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; #ifdef COMPAT_RFC1885 case ICMP6_DST_UNREACH_NOTNEIGHBOR: code = PRC_UNREACH_SRCFAIL; break; #else case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; #endif case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); if (code != 0) goto badcode; code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: case ICMP6_TIME_EXCEED_REASSEMBLY: code += PRC_TIMXCEED_INTRANS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; const int maxlen = sizeof(*nip6) + sizeof(*nicmp6); /* * Prepare an internal mbuf. m_pullup() doesn't * always copy the length we specified. */ if (maxlen >= MCLBYTES) { /* Give up remote */ m_freem(n0); break; } MGETHDR(n, M_DONTWAIT, n0->m_type); if (n && maxlen >= MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ m_freem(n0); break; } M_COPY_PKTHDR(n, n0); /* * Copy IPv6 and ICMPv6 only.
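/*
 * The echo arm being built here is the kernel half of ping6(8). The
 * user half can be sketched with a raw ICMPv6 socket; the kernel fills
 * in the checksum for IPPROTO_ICMPV6 raw sockets, so none is computed
 * below. Error handling is trimmed and the function is illustrative.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <string.h>
#include <unistd.h>

int
send_echo_request(const struct sockaddr_in6 *dst)
{
	struct icmp6_hdr req;
	int s, error;

	if ((s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) < 0)
		return (-1);
	memset(&req, 0, sizeof(req));
	req.icmp6_type = ICMP6_ECHO_REQUEST;
	req.icmp6_code = 0;
	req.icmp6_id = htons((unsigned short)getpid());
	req.icmp6_seq = htons(1);
	error = sendto(s, &req, sizeof(req), 0,
	    (const struct sockaddr *)dst, sizeof(*dst)) < 0 ? -1 : 0;
	close(s);
	return (error);
}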
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); n->m_pkthdr.len += n0->m_pkthdr.len; n->m_next = n0; n0->m_flags &= ~M_PKTHDR; } else { nip6 = mtod(n, struct ip6_hdr *); nicmp6 = (struct icmp6_hdr *)((caddr_t)nip6 + off); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); m = NULL; goto freeit; } mld6_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? */ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; #define hostnamelen strlen(hostname) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxlen, maxhlen; if ((icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4; if (maxlen >= MCLBYTES) { /* Give up remote */ break; } MGETHDR(n, M_DONTWAIT, m->m_type); if (n && maxlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ break; } n->m_pkthdr.rcvif = NULL; n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - maxlen; if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only.
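/*
 * Taken together, the icmp6_nodeinfo tests in this file treat the
 * sysctl as a bit mask: bit 0 permits FQDN replies, bit 1 permits
 * node-address replies, bit 2 permits answering for temporary
 * addresses. A hedged restatement inferred from those checks; the
 * macro names and helper are illustrative, not from the headers, and
 * the combined (icmp6_nodeinfo & 5) != 5 gate for old-style WRU
 * queries is not modeled here.
 */
#define NI_ALLOW_FQDN		0x1
#define NI_ALLOW_NODEADDR	0x2
#define NI_ALLOW_TEMPADDR	0x4

static int
ni_policy_allows(int nodeinfo, int wants_fqdn, int via_temp_addr)
{
	if (via_temp_addr && (nodeinfo & NI_ALLOW_TEMPADDR) == 0)
		return (0);
	if (wants_fqdn)
		return ((nodeinfo & NI_ALLOW_FQDN) != 0);
	return ((nodeinfo & NI_ALLOW_NODEADDR) != 0);
}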
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); bcopy(hostname, p + 4, maxhlen); /* meaningless TTL */ noff = sizeof(struct ip6_hdr); M_COPY_PKTHDR(n, m); /* just for rcvif */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } #undef hostnamelen if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(m, off, icmp6len, code)) { /* In this case, m should've been freed. 
*/ return(IPPROTO_DONE); } break; badcode: icmp6stat.icp6s_badcode++; break; badlen: icmp6stat.icp6s_badlen++; break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(m, off, icmp6len, code) struct mbuf *m; int off, icmp6len; { struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc) __P((int, struct sockaddr *, void *)); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. 
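/*
 * The header walk above depends on two length encodings: AH counts
 * ip6e_len in 32-bit words beyond the first two, every other extension
 * header counts 64-bit words beyond the first one, and the fragment
 * header is fixed size. A hedged helper restating just that rule;
 * ext_hdr_bytes() is illustrative, not a kernel interface.
 */
#include <netinet/in.h>

static int
ext_hdr_bytes(int nxt, int ip6e_len)
{
	if (nxt == IPPROTO_AH)
		return ((ip6e_len + 2) << 2);	/* RFC 2402 encoding */
	if (nxt == IPPROTO_FRAGMENT)
		return (8);			/* always 8 bytes */
	return ((ip6e_len + 1) << 3);		/* RFC 2460 encoding */
}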
*/ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return(-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; icmp6dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &icmp6dst.sin6_addr); #ifndef SCOPEDROUTING if (in6_embedscope(&icmp6dst.sin6_addr, &icmp6dst, NULL, NULL)) { /* should be impossible */ nd6log((LOG_DEBUG, "icmp6_notify_error: in6_embedscope failed\n")); goto freeit; } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. */ bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; icmp6src.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &icmp6src.sin6_addr); #ifndef SCOPEDROUTING if (in6_embedscope(&icmp6src.sin6_addr, &icmp6src, NULL, NULL)) { /* should be impossible */ nd6log((LOG_DEBUG, "icmp6_notify_error: in6_embedscope failed\n")); goto freeit; } #endif icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)&notifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*) __P((int, struct sockaddr *, void *))) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } return(0); freeit: m_freem(m); return(-1); } void icmp6_mtudisc_update(ip6cp, validated) struct ip6ctlparam *ip6cp; int validated; { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct rtentry *rt = NULL; struct sockaddr_in6 sin6; if (!validated) return; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = PF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst; /* XXX normally, this won't happen */ if (IN6_IS_ADDR_LINKLOCAL(dst)) { sin6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); } /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */ rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING | RTF_PRCLONING); if (rt && (rt->rt_flags & RTF_HOST) && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { if (mtu < IPV6_MMTU) { /* xxx */ rt->rt_rmx.rmx_locks |= RTV_MTU; } else if (mtu < rt->rt_ifp->if_mtu && rt->rt_rmx.rmx_mtu > mtu) { icmp6stat.icp6s_pmtuchg++; rt->rt_rmx.rmx_mtu = mtu; } } if (rt) { /* XXX: need braces to avoid conflict with else in RTFREE.
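/*
 * The acceptance policy in icmp6_mtudisc_update() can be restated
 * compactly: an advertised MTU below the IPv6 minimum locks the route
 * instead of shrinking it, and only decreases below both the interface
 * MTU and the current estimate are recorded. A hedged sketch;
 * pmtu_accept() and its arguments are illustrative.
 */
#define IPV6_MINIMUM_MTU	1280	/* IPV6_MMTU */

/* Returns the MTU to record, or 0 when the advertisement is ignored. */
static unsigned int
pmtu_accept(unsigned int advertised, unsigned int if_mtu,
    unsigned int rt_mtu)
{
	if (advertised < IPV6_MINIMUM_MTU)
		return (0);	/* the kernel locks rmx_mtu instead */
	if (advertised < if_mtu && rt_mtu > advertised)
		return (advertised);
	return (0);
}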
*/ RTFREE(rt); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ #define hostnamelen strlen(hostname) static struct mbuf * ni6_input(m, off) struct mbuf *m; int off; { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct sockaddr_in6 sin6; /* double meaning; ip6_dst and subjectaddr */ struct sockaddr_in6 sin6_d; /* XXX: we should retrieve this from m_aux */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return NULL; } #endif /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [icmp-name-lookups-07, Section 4.] */ bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); /* XXX scopeid */ if ((ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)&sin6)) != NULL) { /* unicast/anycast, fine */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } else if (IN6_IS_ADDR_MC_LINKLOCAL(&sin6.sin6_addr)) ; /* link-local multicast, fine */ else goto bad; /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(sin6.sin6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? 
*/ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&sin6.sin6_addr); sin6.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &sin6.sin6_addr); #ifndef SCOPEDROUTING in6_embedscope(&sin6.sin6_addr, &sin6, NULL, NULL); #endif bzero(&sin6_d, sizeof(sin6_d)); sin6_d.sin6_family = AF_INET6; /* not used, actually */ sin6_d.sin6_len = sizeof(sin6_d); /* ditto */ sin6_d.sin6_addr = ip6->ip6_dst; sin6_d.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); #ifndef SCOPEDROUTING in6_embedscope(&sin6_d.sin6_addr, &sin6_d, NULL, NULL); #endif subj = (char *)&sin6; if (SA6_ARE_ADDR_EQUAL(&sin6, &sin6_d)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ n = ni6_nametodns(hostname, hostnamelen, 0); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((icmp6_nodeinfo & 1) == 0) goto bad; break; case NI_QTYPE_NODEADDR: if ((icmp6_nodeinfo & 2) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. */ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { m_freem(m); return(NULL); } M_COPY_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... 
*/ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? */ n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return(n); bad: m_freem(m); if (n) m_freem(n); return(NULL); } #undef hostnamelen /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. */ static struct mbuf * ni6_nametodns(name, namelen, old) const char *name; int namelen; int old; /* return pascal string if non-zero */ { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". 
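 *
 * A worked example of the encoding produced here
 * (illustrative): "foo.example.com" has two dots, so
 * nterm == 1 and the output is
 *
 *	\3 f o o \7 e x a m p l e \3 c o m \0
 *
 * while a shortened name such as "foo" gets nterm == 2,
 *
 *	\3 f o o \0 \0
 *
 * which ni6_dnsmatch() below recognizes as truncated.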
*/ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(a, alen, b, blen) const char *a; int alen; const char *b; int blen; { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(ni6, m, ifpp, subj) struct icmp6_nodeinfo *ni6; struct mbuf *m; struct ifnet **ifpp; char *subj; { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct sockaddr_in6 *subj_ip6 = NULL; /* XXX pedant */ int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return(0); subj_ip6 = (struct sockaddr_in6 *)subj; break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return(0); } } for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { addrsofif = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(&subj_ip6->sin6_addr, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. 
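 *
 * (That is, an anycast address is reported only when the
 * querier explicitly set NI_NODEADDR_FLAG_ANYCAST; the
 * same test is repeated in ni6_store_addrs() below.)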
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; return(addrsofif); } addrs += addrsofif; } return(addrs); } static int ni6_store_addrs(ni6, nni6, ifp0, resid) struct icmp6_nodeinfo *ni6, *nni6; struct ifnet *ifp0; int resid; { struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&ifnet); struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return(0); /* needless to copy */ again: for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; return(copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. 
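 *
 * Worked example (figures illustrative): with
 * ia6t_expire == 1000 and time_second == 400, the reply
 * advertises htonl(600) as the remaining lifetime, while
 * ia6t_expire == 0 is reported as ND6_INFINITE_LIFETIME.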
*/ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_second) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second); else ltime = 0; } bcopy(&ltime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); /* XXX: KAME link-local hack; remove ifindex */ if (IN6_IS_ADDR_LINKLOCAL(&ifa6->ia_addr.sin6_addr)) ((struct in6_addr *)cp)->s6_addr16[1] = 0; cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } return(copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(mp, off) struct mbuf **mp; int off; { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct in6pcb *in6p; struct in6pcb *last = NULL; struct sockaddr_in6 rip6src; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return IPPROTO_DONE; } #endif bzero(&rip6src, sizeof(rip6src)); rip6src.sin6_len = sizeof(struct sockaddr_in6); rip6src.sin6_family = AF_INET6; /* KAME hack: recover scopeid */ (void)in6_recoverscope(&rip6src, &ip6->ip6_src, m->m_pkthdr.rcvif); LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; #ifdef HAVE_NRL_INPCB if (!(in6p->in6p_flags & INP_IPV6)) continue; #endif if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (in6p->in6p_icmp6filt && ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) continue; if (last) { struct mbuf *n; if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, &opts, ip6, n); /* strip intermediate headers */ m_adj(n, off); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } } else sorwakeup(last->in6p_socket); opts = NULL; } } last = in6p; } if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, &opts, ip6, m); /* strip intermediate headers */ m_adj(m, off); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); } else sorwakeup(last->in6p_socket); } else { m_freem(m); ip6stat.ip6s_delivered--; } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. 
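 *
 * A sketch of the assumed mbuf layout (illustrative):
 *
 *	<------------- off ------------->
 *	| ip6_hdr | ext. headers (if any) | icmp6_hdr | ...
 *
 * so off == sizeof(struct ip6_hdr) when no extension
 * headers precede the ICMPv6 header; any smaller off
 * fails the sanity check below.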
*/ void icmp6_reflect(m, off) struct mbuf *m; size_t off; { struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; struct in6_addr t, *src = 0; int plen; int type, code; struct ifnet *outif = NULL; struct sockaddr_in6 sa6_src, sa6_dst; #ifdef COMPAT_RFC1885 int mtu = IPV6_MMTU; struct sockaddr_in6 *sin6 = &icmp6_reflect_rt.ro_dst; #endif /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ t = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * XXX: make sure to embed scope zone information, using * already embedded IDs or the received interface (if any). * Note that rcvif may be NULL. * TODO: scoped routing case (XXX). */ bzero(&sa6_src, sizeof(sa6_src)); sa6_src.sin6_family = AF_INET6; sa6_src.sin6_len = sizeof(sa6_src); sa6_src.sin6_addr = ip6->ip6_dst; in6_recoverscope(&sa6_src, &ip6->ip6_dst, m->m_pkthdr.rcvif); in6_embedscope(&ip6->ip6_dst, &sa6_src, NULL, NULL); bzero(&sa6_dst, sizeof(sa6_dst)); sa6_dst.sin6_family = AF_INET6; sa6_dst.sin6_len = sizeof(sa6_dst); sa6_dst.sin6_addr = t; in6_recoverscope(&sa6_dst, &t, m->m_pkthdr.rcvif); in6_embedscope(&t, &sa6_dst, NULL, NULL); #ifdef COMPAT_RFC1885 /* * xxx guess MTU * RFC 1885 requires that echo reply should be truncated if it * does not fit in with (return) path MTU, but the description was * removed in the new spec. */ if (icmp6_reflect_rt.ro_rt == 0 || ! (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_dst))) { if (icmp6_reflect_rt.ro_rt) { RTFREE(icmp6_reflect_rt.ro_rt); icmp6_reflect_rt.ro_rt = 0; } bzero(sin6, sizeof(*sin6)); sin6->sin6_family = PF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_addr = ip6->ip6_dst; rtalloc_ign((struct route *)&icmp6_reflect_rt.ro_rt, RTF_PRCLONING); } if (icmp6_reflect_rt.ro_rt == 0) goto bad; if ((icmp6_reflect_rt.ro_rt->rt_flags & RTF_HOST) && mtu < icmp6_reflect_rt.ro_rt->rt_ifp->if_mtu) mtu = icmp6_reflect_rt.ro_rt->rt_rmx.rmx_mtu; if (mtu < m->m_pkthdr.len) { plen -= (m->m_pkthdr.len - mtu); m_adj(m, mtu - m->m_pkthdr.len); } #endif /* * If the incoming packet was addressed directly to us(i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case would be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. 
*/ for (ia = in6_ifaddr; ia; ia = ia->ia_next) if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) && (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { src = &t; break; } if (ia == NULL && IN6_IS_ADDR_LINKLOCAL(&t) && (m->m_flags & M_LOOP)) { /* * This is the case if the dst is our link-local address * and the sender is also ourselves. */ src = &t; } if (src == 0) { int e; struct route_in6 ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &e); if (ro.ro_rt) RTFREE(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(&sa6_src.sin6_addr), e)); goto bad; } } ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; if (m->m_pkthdr.rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = nd_ifinfo[m->m_pkthdr.rcvif->if_index].chlim; } else ip6->ip6_hlim = ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif /*IPSEC*/ #ifdef COMPAT_RFC1885 - ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif); + ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif, NULL); #else - ip6_output(m, NULL, NULL, 0, NULL, &outif); + ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); #endif if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo() { mld6_fasttimeo(); } static const char * icmp6_redirect_diag(src6, dst6, tgt6) struct in6_addr *src6; struct in6_addr *dst6; struct in6_addr *tgt6; { static char buf[1024]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(src6), ip6_sprintf(dst6), ip6_sprintf(tgt6)); return buf; } void icmp6_redirect_input(m, off) struct mbuf *m; int off; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; if (!m || !ifp) return; /* XXX if we are router, we don't update route by icmp6 redirect */ if (ip6_forwarding) goto freeit; if (!icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) redtgt6.s6_addr16[1] = htons(ifp->if_index); if (IN6_IS_ADDR_LINKLOCAL(&reddst6)) reddst6.s6_addr16[1] = htons(ifp->if_index); /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(&src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(&src6), ip6->ip6_hlim)); goto bad; } { /* 
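 * (Illustrative scenario for the check below: if the
 * current route to the redirect destination points at
 * gateway fe80::1%ne0, only a Redirect whose outer IPv6
 * source is exactly that gateway is honored; one arriving
 * from any other address is rejected.)
 *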
ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(gw6), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE(rt); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } RTFREE(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (ndopts.nd_opts_rh) { redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len; redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */ } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(&redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); if (!is_onlink) { /* better router case. perform rtredirect. 
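 *
 * (rtredirect() takes (dst, gateway, netmask, flags, src,
 * rtp); below, sdst is the redirect destination, sgw the
 * new first hop (the redirect target), and ssrc the
 * router that sent the redirect, with a NULL netmask
 * since RTF_HOST makes it a host route.)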
*/ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; bzero(&sdst, sizeof(sdst)); bzero(&sgw, sizeof(sgw)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&ssrc, (struct rtentry **)NULL); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif } freeit: m_freem(m); return; bad: icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(m0, rt) struct mbuf *m0; struct rtentry *rt; { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!ip6_forwarding || ip6_accept_rtadv) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; /* we don't currently use sin6_scope_id, but eventually use it */ src_sa.sin6_scope_id = in6_addr2scopeid(ifp, &sip6->ip6_src); if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) MCLGET(m, M_DONTWAIT); if (!m) goto fail; m->m_pkthdr.rcvif = NULL; m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ struct rtentry *rt_router = NULL; int len; struct sockaddr_dl *sdl; struct nd_opt_hdr *nd_opt; char *lladdr; rt_router = nd6_lookup(router_ll6, 0, ifp); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); p += len; } } nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by original * ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
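 *
 * Worked example (illustrative): a 60-byte original
 * packet gives extra == 4; if at least 4 bytes of
 * trailing space remain it is padded to 64 bytes,
 * otherwise truncated to 56, keeping the option body a
 * multiple of 8 bytes.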
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d\n", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; } noredhdropt:; if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_src)) sip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_dst)) sip6->ip6_dst.s6_addr16[1] = 0; #if 0 if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = 0; #endif if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_target)) nd_rd->nd_rd_target.s6_addr16[1] = 0; if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_dst)) nd_rd->nd_rd_dst.s6_addr16[1] = 0; ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif /*IPSEC*/ - ip6_output(m, NULL, NULL, 0, NULL, &outif); + ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } #ifdef HAVE_NRL_INPCB #define sotoin6pcb sotoinpcb #define in6pcb inpcb #define in6p_icmp6filt inp_icmp6filt #endif /* * ICMPv6 socket option processing. 
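 *
 * A hypothetical userland sketch of the ICMP6_FILTER
 * option handled below (RFC 2292 macros, illustrative):
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	struct icmp6_filter filt;
 *
 *	ICMP6_FILTER_SETBLOCKALL(&filt);
 *	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
 *	if (setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER,
 *	    &filt, sizeof(filt)) < 0)
 *		err(1, "setsockopt");
 *
 * which reaches this function as a PRCO_SETOPT and is
 * copied into in6p_icmp6filt via sooptcopyin().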
*/ int icmp6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter *p; if (optlen != sizeof(*p)) { error = EMSGSIZE; break; } if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, optlen); break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyout(sopt, inp->in6p_icmp6filt, sizeof(struct icmp6_filter)); break; } default: error = ENOPROTOOPT; break; } break; } return(error); } #ifdef HAVE_NRL_INPCB #undef sotoin6pcb #undef in6pcb #undef in6p_icmp6filt #endif #ifndef HAVE_PPSRATECHECK #ifndef timersub #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) #endif /* * ppsratecheck(): packets (or events) per second limitation. */ static int ppsratecheck(lasttime, curpps, maxpps) struct timeval *lasttime; int *curpps; int maxpps; /* maximum pps allowed */ { struct timeval tv, delta; int s, rv; s = splclock(); microtime(&tv); splx(s); timersub(&tv, lasttime, &delta); /* * Check for 0,0 so that the message will be seen at least once. * If more than one second has passed since the last update of * lasttime, reset the counter. * * We do increment *curpps even in *curpps < maxpps case, as some may * try to use *curpps for stat purposes as well. */ if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) || delta.tv_sec >= 1) { *lasttime = tv; *curpps = 0; rv = 1; } else if (maxpps < 0) rv = 1; else if (*curpps < maxpps) rv = 1; else rv = 0; #if 1 /* DIAGNOSTIC? */ /* be careful about wrap-around */ if (*curpps + 1 > *curpps) *curpps = *curpps + 1; #else /* * assume that there's not too many calls to this function. * not sure if the assumption holds, as it depends on *caller's* * behavior, not the behavior of this function. * IMHO it is wrong to make assumption on the caller's behavior, * so the above #if is #if 1, not #ifdef DIAGNOSTIC. */ *curpps = *curpps + 1; #endif return (rv); } #endif /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? */ static int icmp6_ratelimit(dst, type, code) const struct in6_addr *dst; /* not used at this moment */ const int type; /* not used at this moment */ const int code; /* not used at this moment */ { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6_gif.c =================================================================== --- head/sys/netinet6/in6_gif.c (revision 105193) +++ head/sys/netinet6/in6_gif.c (revision 105194) @@ -1,343 +1,343 @@ /* $FreeBSD$ */ /* $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #ifdef INET6 #include #include #include #include #endif #include #ifdef INET6 #include #endif #include #include int in6_gif_output(ifp, family, m, rt) struct ifnet *ifp; int family; /* family of the packet to be encapsulate. 
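 *
 * (Resulting packet, sketched: an outer IPv6 header is
 * prepended, e.g. for an inner IPv4 packet
 *
 *	| outer ip6, ip6_nxt = IPPROTO_IPV4 | inner ip | ... |
 *
 * with the outer addresses taken from gif_psrc/gif_pdst.)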
*/ struct mbuf *m; struct rtentry *rt; { struct gif_softc *sc = (struct gif_softc*)ifp; struct sockaddr_in6 *dst = (struct sockaddr_in6 *)&sc->gif_ro6.ro_dst; struct sockaddr_in6 *sin6_src = (struct sockaddr_in6 *)sc->gif_psrc; struct sockaddr_in6 *sin6_dst = (struct sockaddr_in6 *)sc->gif_pdst; struct ip6_hdr *ip6; int proto; u_int8_t itos, otos; if (sin6_src == NULL || sin6_dst == NULL || sin6_src->sin6_family != AF_INET6 || sin6_dst->sin6_family != AF_INET6) { m_freem(m); return EAFNOSUPPORT; } switch (family) { #ifdef INET case AF_INET: { struct ip *ip; proto = IPPROTO_IPV4; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return ENOBUFS; } ip = mtod(m, struct ip *); itos = ip->ip_tos; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; proto = IPPROTO_IPV6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; break; } #endif default: #ifdef DEBUG printf("in6_gif_output: warning: unknown family %d passed\n", family); #endif m_freem(m); return EAFNOSUPPORT; } /* prepend new IP header */ M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { printf("ENOBUFS in in6_gif_output %d\n", __LINE__); return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)m->m_pkthdr.len); ip6->ip6_nxt = proto; ip6->ip6_hlim = ip6_gif_hlim; ip6->ip6_src = sin6_src->sin6_addr; /* bidirectional configured tunnel mode */ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6_dst->sin6_addr)) ip6->ip6_dst = sin6_dst->sin6_addr; else { m_freem(m); return ENETUNREACH; } if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); else ip_ecn_ingress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~ntohl(0xff00000); ip6->ip6_flow |= htonl((u_int32_t)otos << 20); if (dst->sin6_family != sin6_dst->sin6_family || !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &sin6_dst->sin6_addr)) { /* cache route doesn't match */ bzero(dst, sizeof(*dst)); dst->sin6_family = sin6_dst->sin6_family; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = sin6_dst->sin6_addr; if (sc->gif_ro6.ro_rt) { RTFREE(sc->gif_ro6.ro_rt); sc->gif_ro6.ro_rt = NULL; } #if 0 sc->gif_if.if_mtu = GIF_MTU; #endif } if (sc->gif_ro6.ro_rt == NULL) { rtalloc((struct route *)&sc->gif_ro6); if (sc->gif_ro6.ro_rt == NULL) { m_freem(m); return ENETUNREACH; } /* if it constitutes infinite encapsulation, punt. */ if (sc->gif_ro.ro_rt->rt_ifp == ifp) { m_freem(m); return ENETUNREACH; /*XXX*/ } #if 0 ifp->if_mtu = sc->gif_ro6.ro_rt->rt_ifp->if_mtu - sizeof(struct ip6_hdr); #endif } #ifdef IPV6_MINMTU /* * force fragmentation to minimum MTU, to avoid path MTU discovery. * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. 
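 *
 * (IPV6_MINMTU in the flags argument asks ip6_output()
 * to use the 1280-byte minimum IPv6 MTU for this packet,
 * so fragmentation happens here instead of relying on
 * packet-too-big feedback for the inner flow.)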
*/ - return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL)); + return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, NULL)); #else - return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL)); + return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, NULL)); #endif } int in6_gif_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp; struct ifnet *gifp = NULL; struct ip6_hdr *ip6; int af = 0; u_int32_t otos; ip6 = mtod(m, struct ip6_hdr *); gifp = (struct ifnet *)encap_getarg(m); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); ip6stat.ip6s_nogif++; return IPPROTO_DONE; } otos = ip6->ip6_flow; m_adj(m, *offp); switch (proto) { #ifdef INET case IPPROTO_IPV4: { struct ip *ip; u_int8_t otos8; af = AF_INET; otos8 = (ntohl(otos) >> 20) & 0xff; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return IPPROTO_DONE; } ip = mtod(m, struct ip *); if (gifp->if_flags & IFF_LINK1) ip_ecn_egress(ECN_ALLOWED, &otos8, &ip->ip_tos); else ip_ecn_egress(ECN_NOCARE, &otos8, &ip->ip_tos); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_IPV6: { struct ip6_hdr *ip6; af = AF_INET6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return IPPROTO_DONE; } ip6 = mtod(m, struct ip6_hdr *); if (gifp->if_flags & IFF_LINK1) ip6_ecn_egress(ECN_ALLOWED, &otos, &ip6->ip6_flow); else ip6_ecn_egress(ECN_NOCARE, &otos, &ip6->ip6_flow); break; } #endif default: ip6stat.ip6s_nogif++; m_freem(m); return IPPROTO_DONE; } gif_input(m, af, gifp); return IPPROTO_DONE; } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int gif_encapcheck6(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip6_hdr ip6; struct gif_softc *sc; struct sockaddr_in6 *src, *dst; int addrmatch; /* sanity check done in caller */ sc = (struct gif_softc *)arg; src = (struct sockaddr_in6 *)sc->gif_psrc; dst = (struct sockaddr_in6 *)sc->gif_pdst; m_copydata(m, 0, sizeof(ip6), (caddr_t)&ip6); /* check for address match */ addrmatch = 0; if (IN6_ARE_ADDR_EQUAL(&src->sin6_addr, &ip6.ip6_dst)) addrmatch |= 1; if (IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6.ip6_src)) addrmatch |= 2; if (addrmatch != 3) return 0; /* martian filters on outer source - done in ip6_input */ /* ingress filters on outer source */ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { struct sockaddr_in6 sin6; struct rtentry *rt; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = ip6.ip6_src; /* XXX scopeid */ rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { #if 0 log(LOG_WARNING, "%s: packet from %s dropped " "due to ingress filter\n", if_name(&sc->gif_if), ip6_sprintf(&sin6.sin6_addr)); #endif if (rt) rtfree(rt); return 0; } rtfree(rt); } return 128 * 2; } Index: head/sys/netinet6/ip6_input.c =================================================================== --- head/sys/netinet6/ip6_input.c (revision 105193) +++ head/sys/netinet6/ip6_input.c (revision 105194) @@ -1,1672 +1,1657 @@ /* $FreeBSD$ */ /* $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pfil_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PFIL_HOOKS #include #endif #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif #include #include #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; static int ip6qmaxlen = IFQ_MAXLEN; struct in6_ifaddr *in6_ifaddr; extern struct callout in6_tmpaddrtimer_ch; int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ int ip6_sourcecheck_interval; /* XXX */ int ip6_ours_check_algorithm; /* firewall hooks */ ip6_fw_chk_t *ip6_fw_chk_ptr; ip6_fw_ctl_t *ip6_fw_ctl_ptr; int ip6_fw_enable = 1; struct ip6stat ip6stat; static void ip6_init2 __P((void *)); -static struct mbuf *ip6_setdstifaddr __P((struct mbuf *, struct in6_ifaddr *)); +static struct ip6aux *ip6_setdstifaddr __P((struct mbuf *, struct in6_ifaddr *)); static int ip6_hopopts_input __P((u_int32_t *, u_int32_t *, struct mbuf **, int *)); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr __P((struct mbuf *, size_t, int)); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init() { struct ip6protosw *pr; int i; struct timeval tv; #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); #endif pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip6_init"); for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; for (pr = (struct ip6protosw *)inet6domain.dom_protosw; pr < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) ip6_protox[pr->pr_protocol] = pr - inet6sw; ip6intrq.ifq_maxlen = ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); ip6intrq_present = 1; register_netisr(NETISR_IPV6, ip6intr); nd6_init(); frag6_init(); /* * in many cases, random() here does NOT return random number * as initialization during bootstrap time occur in fixed order. */ microtime(&tv); ip6_flow_seq = random() ^ tv.tv_usec; microtime(&tv); ip6_desync_factor = (random() ^ tv.tv_usec) % MAX_TEMP_DESYNC_FACTOR; } static void ip6_init2(dummy) void *dummy; { /* * to route local address of p2p link to loopback, * assign loopback address first. */ in6_ifattach(&loif[0], NULL); /* nd6_timer_init */ callout_init(&nd6_timer_ch, 0); callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); /* router renumbering prefix list maintenance */ callout_init(&in6_rr_timer_ch, 0); callout_reset(&in6_rr_timer_ch, hz, in6_rr_timer, NULL); /* timer for regeneranation of temporary addresses randomize ID */ callout_reset(&in6_tmpaddrtimer_ch, (ip6_temp_preferred_lifetime - ip6_desync_factor - ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); /* * IP6 input interrupt handling. Just pass the packet to ip6_input. 
*/ void ip6intr() { int s; struct mbuf *m; for (;;) { s = splimp(); IF_DEQUEUE(&ip6intrq, m); splx(s); if (m == 0) return; ip6_input(m); } } extern struct route_in6 ip6_forward_rt; void ip6_input(m) struct mbuf *m; { struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; u_int32_t plen; u_int32_t rtalert = ~0; int nxt, ours = 0; struct ifnet *deliverifp = NULL; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m0; int rv; #endif /* PFIL_HOOKS */ #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). */ if (m) { m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; } #endif /* * make sure we don't have onion peering information into m_aux. */ ip6_delaux(m); /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) ip6stat.ip6s_mext2m++; else ip6stat.ip6s_mext1++; } else { #define M2MMAX (sizeof(ip6stat.ip6s_m2m)/sizeof(ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { ip6stat.ip6s_m2m[loif[0].if_index]++; /* XXX */ } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else ip6stat.ip6s_m2m[0]++; } else ip6stat.ip6s_m1++; #undef M2MMAX } in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); ip6stat.ip6s_total++; #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n) M_COPY_PKTHDR(n, m); if (n && m->m_pkthdr.len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (n == NULL) { m_freem(m); return; /*ENOBUFS*/ } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = m->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /*nothing*/); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == 0) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { ip6stat.ip6s_badvers++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } #ifdef PFIL_HOOKS /* * Run through list of hooks for input packets. If there are any * filters which require that additional packets in the flow are * not fast-forwarded, they must clear the M_CANFASTFWD flag. * Note that filters must _never_ set this flag, as another filter * in the list may have previously cleared it. */ m0 = m; pfh = pfil_hook_get(PFIL_IN, &inet6sw[ip6_protox[IPPROTO_IPV6]].pr_pfh); for (; pfh; pfh = pfh->pfil_link.tqe_next) if (pfh->pfil_func) { rv = pfh->pfil_func(ip6, sizeof(*ip6), m->m_pkthdr.rcvif, 0, &m0); if (rv) return; m = m0; if (m == NULL) return; ip6 = mtod(m, struct ip6_hdr *); } #endif /* PFIL_HOOKS */ ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; /* If ipfw says divert, we have to just drop packet */ /* use port as a dummy argument */ if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) { m_freem(m); m = NULL; } if (!m) return; } /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. 
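 *
 * (Examples rejected here, for illustration: a source of
 * ff02::1, since a multicast source is never valid, or a
 * destination of ::, the unspecified address.)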
*/ ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if ((IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) || IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst)) && (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partical support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* drop packets if interface ID portion is already filled */ if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src) && ip6->ip6_src.s6_addr16[1]) { ip6stat.ip6s_badscope++; goto bad; } if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst) && ip6->ip6_dst.s6_addr16[1]) { ip6stat.ip6s_badscope++; goto bad; } } if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); #if 0 /* this case seems to be unnecessary. (jinmei, 20010401) */ /* * We use rt->rt_ifp to determine if the address is ours or not. * If rt_ifp is lo0, the address is ours. * The problem here is, rt->rt_ifp for fe80::%lo0/64 is set to lo0, * so any address under fe80::%lo0/64 will be mistakenly considered * local. The special case is supplied to handle the case properly * by actually looking at interface addresses * (using in6ifa_ifpwithaddr). */ if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) != 0 && IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { if (!in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst)) { icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); /* m is already freed */ return; } ours = 1; deliverifp = m->m_pkthdr.rcvif; goto hbhcheck; } #endif /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *in6m = 0; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); /* * See if we belong to the destination multicast group on the * arrival interface. */ IN6_LOOKUP_MULTI(ip6->ip6_dst, m->m_pkthdr.rcvif, in6m); if (in6m) ours = 1; else if (!ip6_mrouter) { ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } deliverifp = m->m_pkthdr.rcvif; goto hbhcheck; } /* * Unicast check */ switch (ip6_ours_check_algorithm) { default: /* * XXX: I intentionally broke our indentation rule here, * since this switch-case is just for measurement and * therefore should soon be removed. 
*/ if (ip6_forward_rt.ro_rt != NULL && (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &((struct sockaddr_in6 *)(&ip6_forward_rt.ro_dst))->sin6_addr)) ip6stat.ip6s_forward_cachehit++; else { struct sockaddr_in6 *dst6; if (ip6_forward_rt.ro_rt) { /* route is down or destination is different */ ip6stat.ip6s_forward_cachemiss++; RTFREE(ip6_forward_rt.ro_rt); ip6_forward_rt.ro_rt = 0; } bzero(&ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; #ifdef SCOPEDROUTING ip6_forward_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); #endif rtalloc_ign((struct route *)&ip6_forward_rt, RTF_PRCLONING); } #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) /* * Accept the packet if the forwarding interface to the destination * according to the routing table is the loopback interface, * unless the associated route has a gateway. * Note that this approach causes to accept a packet if there is a * route to the loopback interface for the destination of the packet. * But we think it's even useful in some situations, e.g. when using * a special daemon which wants to intercept the packet. * * XXX: some OSes automatically make a cloned route for the destination * of an outgoing packet. If the outgoing interface of the packet * is a loopback one, the kernel would consider the packet to be * accepted, even if we have no such address assinged on the interface. * We check the cloned flag of the route entry to reject such cases, * assuming that route entries for our own addresses are not made by * cloning (it should be true because in6_addloop explicitly installs * the host route). However, we might have to do an explicit check * while it would be less efficient. Or, should we rather install a * reject route for such a case? */ if (ip6_forward_rt.ro_rt && (ip6_forward_rt.ro_rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST && #ifdef RTF_WASCLONED !(ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) && #endif #ifdef RTF_CLONED !(ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) && #endif #if 0 /* * The check below is redundant since the comparison of * the destination and the key of the rtentry has * already done through looking up the routing table. */ IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt6_key(ip6_forward_rt.ro_rt)->sin6_addr) #endif ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_aux. */ (void)ip6_setdstifaddr(m, ia6); /* * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. */ if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ /* Count the packet in the ip address stats */ ia6->ia_ifa.if_ipackets++; ia6->ia_ifa.if_ibytes += m->m_pkthdr.len; goto hbhcheck; } else { /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst))); goto bad; } } } /* XXX indentation (see above) */ /* * FAITH(Firewall Aided Internet Translator) */ if (ip6_keepfaith) { if (ip6_forward_rt.ro_rt && ip6_forward_rt.ro_rt->rt_ifp && ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? 
*/ ours = 1; deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /* faith */ goto hbhcheck; } } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!ip6_forwarding) { ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * record address information into m_aux, if we don't have one yet. * note that we are unable to record it, if the address is not listed * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). */ if (deliverifp && !ip6_getdstifaddr(m)) { struct in6_ifaddr *ia6; ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) { if (!ip6_setdstifaddr(m, ia6)) { /* * XXX maybe we should drop the packet here, * as we could not provide enough information * to the upper layers. */ } } } /* * Process the Hop-by-Hop options header if it is present. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif return; /* m has already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); return; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return; } #endif nxt = hbh->ip6h_nxt; /* * accept the packet if a router alert option is included * and we act as an IPv6 router. */ if (rtalert != ~0 && ip6_forwarding) ours = 1; } else nxt = ip6->ip6_nxt; /* * Check that the amount of data in the buffers * is at least as much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below.
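The trim/drop logic above (after the hbhcheck label) reconciles the data actually received with the payload length the header claims, possibly replaced by a Jumbo Payload value: too little data means the packet is dropped, extra data is trimmed. A sketch of the same arithmetic on a flat buffer (reconcile_len is an illustrative name):

#include <stddef.h>

#define IP6_HDRLEN 40	/* sizeof(struct ip6_hdr) */

/* returns the usable packet length, or -1 to drop */
static long
reconcile_len(size_t rcvd, unsigned long plen)
{
	if (rcvd < IP6_HDRLEN || rcvd - IP6_HDRLEN < plen)
		return -1;			/* too short: drop (ip6s_tooshort) */
	return (long)(IP6_HDRLEN + plen);	/* trim anything beyond this point */
}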
*/ if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; m_freem(m); return; } if (!ours) { m_freem(m); return; } } else if (!ours) { ip6_forward(m, 0); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { ip6stat.ip6s_toomanyhdr++; goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #if 0 /* * do we need to do it for every header? yeah, other * functions can play with it (like re-allocate and copy). */ mhist = ip6_addaux(m); if (mhist && M_TRAILINGSPACE(mhist) >= sizeof(nxt)) { hist = mtod(mhist, caddr_t) + mhist->m_len; bcopy(&nxt, hist, sizeof(nxt)); mhist->m_len += sizeof(nxt); } else { ip6stat.ip6s_toomanyhdr++; goto bad; } #endif #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 && ipsec6_in_reject(m, NULL)) { ipsec6stat.in_polvio++; goto bad; } #endif nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * set/grab in6_ifaddr correspond to IPv6 destination address. * XXX backward compatibility wrapper */ -static struct mbuf * +static struct ip6aux * ip6_setdstifaddr(m, ia6) struct mbuf *m; struct in6_ifaddr *ia6; { - struct mbuf *n; + struct ip6aux *n; n = ip6_addaux(m); if (n) - mtod(n, struct ip6aux *)->ip6a_dstia6 = ia6; + n->ip6a_dstia6 = ia6; return n; /* NULL if failed to set */ } struct in6_ifaddr * ip6_getdstifaddr(m) struct mbuf *m; { - struct mbuf *n; + struct ip6aux *n; n = ip6_findaux(m); if (n) - return mtod(n, struct ip6aux *)->ip6a_dstia6; + return n->ip6a_dstia6; else return NULL; } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. 
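The ip6_setdstifaddr()/ip6_getdstifaddr() hunks above are part of this commit's conversion from the old m_aux mbuf chain to the m_tag(9) packet-tag API: find the tag, or allocate one and prepend it, then hand back the payload that follows the tag header (the kernel's tag + 1). A hedged userland analog of that pattern; struct pkt_tag, struct packet, and the value of TAG_IPV6_INPUT are illustrative:

#include <stdlib.h>
#include <string.h>

struct pkt_tag {
	struct pkt_tag	*next;
	int		 type;
	size_t		 len;
	/* payload follows the header, like (tag + 1) in the kernel */
};

struct packet { struct pkt_tag *tags; };

#define TAG_IPV6_INPUT 1

static struct pkt_tag *
tag_find(struct packet *p, int type)		/* cf. m_tag_find() */
{
	struct pkt_tag *t;

	for (t = p->tags; t != NULL; t = t->next)
		if (t->type == type)
			return t;
	return NULL;
}

static void *
pkt_addaux(struct packet *p, size_t len)	/* cf. ip6_addaux() */
{
	struct pkt_tag *t = tag_find(p, TAG_IPV6_INPUT);

	if (t == NULL) {
		t = malloc(sizeof(*t) + len);	/* cf. m_tag_get() */
		if (t == NULL)
			return NULL;		/* caller copes, as in the kernel */
		t->type = TAG_IPV6_INPUT;
		t->len = len;
		t->next = p->tags;		/* cf. m_tag_prepend() */
		p->tags = t;
	}
	memset(t + 1, 0, len);			/* zeroed on every call, as above */
	return t + 1;				/* payload right after the header */
}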
*/ static int ip6_hopopts_input(plenp, rtalertp, mp, offp) u_int32_t *plenp; u_int32_t *rtalertp; /* XXX: should be stored in a smarter way */ struct mbuf **mp; int *offp; { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; u_int8_t *opt; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return(-1); *offp = off; *mp = m; return(0); } /* * Search the header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself processes its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that the hbh header is located right after the IPv6 * header (RFC2460 p7), that opthead is a pointer into the data content of m, * and that the region from opthead to opthead + hbhlen is in contiguous * memory. */ int ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) struct mbuf *m; u_int8_t *opthead; int hbhlen; u_int32_t *rtalertp; u_int32_t *plenp; { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { ip6stat.ip6s_toosmall++; goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_RTALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return(-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return(-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have a non-zero payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return(-1); } /* * We may see jumbolen at an unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected.
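ip6_process_hopopts() above walks the options area as a TLV sequence: Pad1 is a lone type octet, everything else is type, length, then that many value bytes, with bounds checks before the length octet is read. A compact userland sketch of the same walk (walk_options is an illustrative name):

#include <stdint.h>

#define OPT_PAD1 0x00

static int
walk_options(const uint8_t *opt, int len)
{
	int optlen;

	while (len > 0) {
		if (*opt == OPT_PAD1) {
			optlen = 1;		/* single-byte padding */
		} else {
			if (len < 2)
				return -1;	/* truncated TLV */
			optlen = opt[1] + 2;	/* type + length octets + value */
			if (optlen > len)
				return -1;	/* value runs past the header */
		}
		/* a real parser dispatches on *opt here (RTALERT, JUMBO, ...) */
		opt += optlen;
		len -= optlen;
	}
	return 0;
}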
* the behavior may need some debate in ipngwg - * multiple options do not make sense; however, * there's no explicit mention in the specification. */ if (*plenp != 0) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return(-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return(-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { ip6stat.ip6s_toosmall++; goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return(-1); optlen += 2; break; } } return(0); bad: m_freem(m); return(-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header and the option header are not * contiguous, in order to return an ICMPv6 error. */ int ip6_unknown_opt(optp, m, off) u_int8_t *optp; struct mbuf *m; int off; { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return(-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return(-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ ip6stat.ip6s_badoptions++; ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return(-1); } m_freem(m); /* XXX: NOTREACHED */ return(-1); } /* * Create the "control" list for this pcb. * The function will not modify the mbuf chain at all. * * with KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) has already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. */ void ip6_savecontrol(in6p, mp, ip6, m) struct inpcb *in6p; struct mbuf **mp; struct ip6_hdr *ip6; struct mbuf *m; { #if __FreeBSD_version >= 500000 struct thread *td = curthread; /* XXX */ #else struct proc *td = curproc; /* XXX */ #endif int privileged = 0; int rthdr_exist = 0; if (td && !suser(td)) privileged++; #ifdef SO_TIMESTAMP if ((in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) { mp = &(*mp)->m_next; } } #endif /* RFC 2292 sec. 5 */ if ((in6p->in6p_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); if (IN6_IS_SCOPE_LINKLOCAL(&pi6.ipi6_addr)) pi6.ipi6_addr.s6_addr16[1] = 0; pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IPV6_PKTINFO, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((in6p->in6p_flags & IN6P_HOPLIMIT) != 0) { int hlim = ip6->ip6_hlim & 0xff; *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IPV6_HOPLIMIT, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } /* * IPV6_HOPOPTS socket option.
We require super-user privilege * for the option, but it might be too strict, since there might * be some hop-by-hop options which can be returned to a normal user. * See RFC 2292 section 6. */ if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0 && privileged) { /* * Check if a hop-by-hop options header is contained in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through * the IPv6 input processing. */ struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; return; } #endif /* * XXX: We copy the whole header even if a jumbo * payload option is included, though RFC 2292 says * the option is to be removed before returning. * Note: this constraint is removed in 2292bis. */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IPV6_HOPOPTS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } /* IPV6_DSTOPTS and IPV6_RTHDR socket options */ if ((in6p->in6p_flags & (IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { int proto, off, nxt; /* * go through the header chain to see if a routing header is * contained in the packet. We need this information to store * destination options headers (if any) properly. * XXX: performance issue. We should record this info when * processing extension headers in the incoming routine. * (todo) use m_aux? */ proto = IPPROTO_IPV6; off = 0; nxt = -1; while (1) { int newoff; newoff = ip6_nexthdr(m, off, proto, &nxt); if (newoff < 0) break; if (newoff < off) /* invalid, check for safety */ break; if ((proto = nxt) == IPPROTO_ROUTING) { rthdr_exist = 1; break; } off = newoff; } } if ((in6p->in6p_flags & (IN6P_RTHDR | IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and store each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible?
*/ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if ((in6p->in6p_flags & IN6P_DSTOPTS) == 0) break; /* * We also require super-user privilege for * the option. * See the comments on IN6P_HOPOPTS. */ if (!privileged) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IPV6_DSTOPTS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if ((in6p->in6p_flags & IN6P_RTHDR) == 0) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IPV6_RTHDR, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } } #ifdef PULLDOWN_TEST /* * pull a single extension header from the mbuf chain. returns a single * mbuf that contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(m, off, nxt) struct mbuf *m; size_t off; int nxt; { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; MGET(n, M_DONTWAIT, MT_DATA); if (n && elen >= MLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) return NULL; n->m_len = 0; if (elen >= M_TRAILINGSPACE(n)) { m_free(n); return NULL; } m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get a pointer to the previous header followed by the header * currently processed. * XXX: This function supposes that * M includes all headers, * the next header field and the header length field of each header * are valid, and * the sum of each header length equals OFF. * Because of these assumptions, this function must be called very * carefully. Moreover, it will not be used in the near future when * we develop a `neater' mechanism to process extension headers.
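The repeated length computations above, and in ip6_nexthdr()/ip6_get_prevhdr() below, encode the extension-header rules: the length octet counts 8-byte units beyond the first eight bytes, except for AH, where it counts 4-byte units beyond the first two words (RFC 2402). Isolated as a sketch:

#include <stdint.h>
#include <netinet/in.h>

static unsigned int
exthdr_len(int proto, uint8_t len_octet)
{
	if (proto == IPPROTO_AH)			/* RFC 2402: 4-byte units */
		return ((unsigned int)len_octet + 2) << 2;
	return ((unsigned int)len_octet + 1) << 3;	/* RFC 2460: 8-byte units */
}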
*/ char * ip6_get_prevhdr(m, off) struct mbuf *m; int off; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return(&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return(&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(m, off, proto, nxtp) struct mbuf *m; int off; int proto; int *nxtp; { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); if ((ntohs(fh.ip6f_offlg) & IP6F_OFF_MASK) != 0) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } return -1; } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(m, off, proto, nxtp) struct mbuf *m; int off; int proto; int *nxtp; { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } -struct mbuf * +struct ip6aux * ip6_addaux(m) struct mbuf *m; { - struct mbuf *n; - -#ifdef DIAGNOSTIC - if (sizeof(struct ip6aux) > MHLEN) - panic("assumption failed on sizeof(ip6aux)"); -#endif - n = m_aux_find(m, AF_INET6, -1); - if (n) { - if (n->m_len < sizeof(struct ip6aux)) { - printf("conflicting use of ip6aux"); - return NULL; - } - } else { - n = m_aux_add(m, AF_INET6, -1); - n->m_len = sizeof(struct ip6aux); - bzero(mtod(n, caddr_t), n->m_len); + struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); + if (!tag) { + tag = m_tag_get(PACKET_TAG_IPV6_INPUT, + sizeof (struct ip6aux), + M_NOWAIT); + if (tag) + m_tag_prepend(m, tag); } - return n; + if (tag) + bzero(tag+1, sizeof (struct ip6aux)); + return tag ? 
(struct ip6aux*)(tag+1) : NULL; } -struct mbuf * +struct ip6aux * ip6_findaux(m) struct mbuf *m; { - struct mbuf *n; - - n = m_aux_find(m, AF_INET6, -1); - if (n && n->m_len < sizeof(struct ip6aux)) { - printf("conflicting use of ip6aux"); - n = NULL; - } - return n; + struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); + return tag ? (struct ip6aux*)(tag+1) : NULL; } void ip6_delaux(m) struct mbuf *m; { - struct mbuf *n; - - n = m_aux_find(m, AF_INET6, -1); - if (n) - m_aux_delete(m, n); + struct m_tag *tag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); + if (tag) + m_tag_delete(m, tag); } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; Index: head/sys/netinet6/ip6_mroute.c =================================================================== --- head/sys/netinet6/ip6_mroute.c (revision 105193) +++ head/sys/netinet6/ip6_mroute.c (revision 105194) @@ -1,1814 +1,1814 @@ /* $FreeBSD$ */ /* $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenenr, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MRTABLE, "mf6c", "multicast forwarding cache entry"); #define M_HASCL(m) ((m)->m_flags & M_EXT) static int ip6_mdq __P((struct mbuf *, struct ifnet *, struct mf6c *)); static void phyint_send __P((struct ip6_hdr *, struct mif6 *, struct mbuf *)); static int set_pim6 __P((int *)); static int socket_send __P((struct socket *, struct mbuf *, struct sockaddr_in6 *)); static int register_send __P((struct ip6_hdr *, struct mif6 *, struct mbuf *)); /* * Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static, * except for netstat or debugging purposes. */ struct socket *ip6_mrouter = NULL; int ip6_mrouter_ver = 0; int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */ struct mrt6stat mrt6stat; #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 struct mf6c *mf6ctable[MF6CTBLSIZ]; u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; #ifdef MRT6DEBUG u_int mrt6debug = 0; /* debug level */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #endif static void expire_upcalls __P((void *)); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ #ifdef INET #ifdef MROUTING extern struct socket *ip_mrouter; #endif #endif /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). These interfaces are never linked into * the system ifnet list & no routes point to them. I.e., packets * can't be sent this way. They only exist as a placeholder for * multicast source verification. */ struct ifnet multicast_register_if; #define ENCAP_HOPS 64 /* * Private variables. */ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; static int pim6; /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. * Quality of service parameter to be added in the future!!! 
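MF6CHASH above, used by the MF6CFIND lookup that follows, folds the origin and group addresses by XORing their eight 32-bit words and reducing the result to a table index. An illustrative userland rendering (memcpy replaces the non-portable s6_addr32 accessor, and the reduction assumes MF6CHASHMOD is a plain reduction by MF6CTBLSIZ):

#include <stdint.h>
#include <string.h>
#include <netinet/in.h>

#define MF6CTBLSIZ 256	/* table size taken from the kernel header */

static unsigned int
mf6c_hash(const struct in6_addr *o, const struct in6_addr *g)
{
	uint32_t w[8], h = 0;
	int i;

	memcpy(&w[0], o, sizeof(*o));	/* four words of the origin */
	memcpy(&w[4], g, sizeof(*g));	/* four words of the group */
	for (i = 0; i < 8; i++)
		h ^= w[i];
	return h % MF6CTBLSIZ;
}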
*/ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ mrt6stat.mrt6s_mfc_lookups++; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ mrt6stat.mrt6s_mfc_misses++; \ } \ } while (0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code */ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* fall through */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (0) #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int get_sg_cnt __P((struct sioc_sg_req6 *)); static int get_mif6_cnt __P((struct sioc_mif_req6 *)); static int ip6_mrouter_init __P((struct socket *, struct mbuf *, int)); static int add_m6if __P((struct mif6ctl *)); static int del_m6if __P((mifi_t *)); static int add_m6fc __P((struct mf6cctl *)); static int del_m6fc __P((struct mf6cctl *)); static struct callout expire_upcalls_ch; /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ int ip6_mrouter_set(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; struct mbuf *m; if (so != ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EACCES); if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ return (error); if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ return (error); switch (sopt->sopt_name) { case MRT6_INIT: #ifdef MRT6_OINIT case MRT6_OINIT: #endif error = ip6_mrouter_init(so, m, sopt->sopt_name); break; case MRT6_DONE: error = ip6_mrouter_done(); break; case MRT6_ADD_MIF: error = add_m6if(mtod(m, struct mif6ctl *)); break; case MRT6_DEL_MIF: error = del_m6if(mtod(m, mifi_t *)); break; case MRT6_ADD_MFC: error = add_m6fc(mtod(m, struct mf6cctl *)); break; case MRT6_DEL_MFC: error = del_m6fc(mtod(m, struct mf6cctl *)); break; case MRT6_PIM: error = set_pim6(mtod(m, int *)); break; default: error = EOPNOTSUPP; break; } (void)m_freem(m); return(error); } /* * Handle MRT getsockopt commands */ int ip6_mrouter_get(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; if (so != ip6_mrouter) return EACCES; switch (sopt->sopt_name) { case MRT6_PIM: error = sooptcopyout(sopt, &pim6, sizeof(pim6)); break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int mrt6_ioctl(cmd, data) int cmd; caddr_t data; { int error = 0; switch (cmd) { case SIOCGETSGCNT_IN6: return(get_sg_cnt((struct sioc_sg_req6 *)data)); break; /* for safety */ case SIOCGETMIFCNT_IN6: return(get_mif6_cnt((struct sioc_mif_req6 *)data)); break; /* for safety */ default: return (EINVAL); break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(req) struct sioc_sg_req6 *req; { struct mf6c *rt; int s; s = splnet(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); splx(s); if (rt != NULL) { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; req->wrong_if = rt->mf6c_wrong_if; } else return(ESRCH); #if 0 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 
#endif return 0; } /* * returns the input and output packet and byte counts on the mif provided */ static int get_mif6_cnt(req) struct sioc_mif_req6 *req; { mifi_t mifi = req->mifi; if (mifi >= nummifs) return EINVAL; req->icount = mif6table[mifi].m6_pkt_in; req->ocount = mif6table[mifi].m6_pkt_out; req->ibytes = mif6table[mifi].m6_bytes_in; req->obytes = mif6table[mifi].m6_bytes_out; return 0; } static int set_pim6(i) int *i; { if ((*i != 1) && (*i != 0)) return EINVAL; pim6 = *i; return 0; } /* * Enable multicast routing */ static int ip6_mrouter_init(so, m, cmd) struct socket *so; struct mbuf *m; int cmd; { int *v; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); #endif if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_ICMPV6) return EOPNOTSUPP; if (!m || (m->m_len != sizeof(int *))) return ENOPROTOOPT; v = mtod(m, int *); if (*v != 1) return ENOPROTOOPT; if (ip6_mrouter != NULL) return EADDRINUSE; ip6_mrouter = so; ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); bzero((caddr_t)n6expire, sizeof(n6expire)); pim6 = 0; /* used for stubbing out/in pim stuff */ callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init\n"); #endif return 0; } /* * Disable multicast routing */ int ip6_mrouter_done() { mifi_t mifi; int i; struct ifnet *ifp; struct in6_ifreq ifr; struct mf6c *rt; struct rtdetq *rte; int s; s = splnet(); /* * For each phyint in use, disable promiscuous reception of all IPv6 * multicasts. */ #ifdef INET #ifdef MROUTING /* * If an IPv4 multicast routing daemon is still running, * we keep the interfaces receiving all multicast packets. * XXX: there may be an interface in which the IPv4 multicast * daemon is not interested... */ if (!ip_mrouter) #endif #endif { for (mifi = 0; mifi < nummifs; mifi++) { if (mif6table[mifi].m6_ifp && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { ifr.ifr_addr.sin6_family = AF_INET6; ifr.ifr_addr.sin6_addr = in6addr_any; ifp = mif6table[mifi].m6_ifp; (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); } } } #ifdef notyet bzero((caddr_t)qtable, sizeof(qtable)); bzero((caddr_t)tbftable, sizeof(tbftable)); #endif bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); /* * Free all multicast forwarding cache entries. */ for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; while (rt) { struct mf6c *frt; for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_free(rte->m); free(rte, M_MRTABLE); rte = n; } frt = rt; rt = rt->mf6c_next; free(frt, M_MRTABLE); } } bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); /* * Reset de-encapsulation cache */ reg_mif_num = -1; ip6_mrouter = NULL; ip6_mrouter_ver = 0; splx(s); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "ip6_mrouter_done\n"); #endif return 0; } static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; /* * Add a mif to the mif table */ static int add_m6if(mifcp) struct mif6ctl *mifcp; { struct mif6 *mifp; struct ifnet *ifp; int error, s; #ifdef notyet struct tbf *m_tbf = tbftable + mifcp->mif6c_mifi; #endif if (mifcp->mif6c_mifi >= MAXMIFS) return EINVAL; mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp) return EADDRINUSE; /* XXX: is it appropriate?
*/ if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) return ENXIO; ifp = ifnet_byindex(mifcp->mif6c_pifi); if (mifcp->mif6c_flags & MIFF_REGISTER) { if (reg_mif_num == (mifi_t)-1) { multicast_register_if.if_name = "register_mif"; multicast_register_if.if_flags |= IFF_LOOPBACK; multicast_register_if.if_index = mifcp->mif6c_mifi; reg_mif_num = mifcp->mif6c_mifi; } ifp = &multicast_register_if; } /* if REGISTER */ else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return EOPNOTSUPP; s = splnet(); error = if_allmulti(ifp, 1); splx(s); if (error) return error; } s = splnet(); mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; #ifdef notyet /* scaling up here allows division by 1024 in critical code */ mifp->m6_rate_limit = mifcp->mif6c_rate_limit * 1024 / 1000; #endif /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; splx(s); /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "add_mif #%d, phyint %s%d\n", mifcp->mif6c_mifi, ifp->if_name, ifp->if_unit); #endif return 0; } /* * Delete a mif from the mif table */ static int del_m6if(mifip) mifi_t *mifip; { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; int s; if (*mifip >= nummifs) return EINVAL; if (mifp->m6_ifp == NULL) return EINVAL; s = splnet(); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* * XXX: what if there is yet IPv4 multicast daemon * using the interface? */ ifp = mifp->m6_ifp; if_allmulti(ifp, 0); } #ifdef notyet bzero((caddr_t)qtable[*mifip], sizeof(qtable[*mifip])); bzero((caddr_t)mifp->m6_tbf, sizeof(*(mifp->m6_tbf))); #endif bzero((caddr_t)mifp, sizeof (*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; splx(s); #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs); #endif return 0; } /* * Add an mfc entry */ static int add_m6fc(mfccp) struct mf6cctl *mfccp; { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; int s; MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc no upcall h %d o %s g %s p %x\n", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif s = splnet(); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; splx(s); return 0; } /* * Find the entry for which the upcall was made and update */ s = splnet(); hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc o %s g %s p %x dbg %x\n", ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, 
rt->mf6c_stall); #endif rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip6_mdq(rte->m, rte->ifp, rt); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG,"add_mfc no upcall h %d o %s g %s p %x\n", hash, ip6_sprintf(&mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(&mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { splx(s); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } splx(s); return 0; } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(t) struct timeval *t; { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(mfccp) struct mf6cctl *mfccp; { struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; int s; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_MFC) log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n", ip6_sprintf(&origin.sin6_addr), ip6_sprintf(&mcastgrp.sin6_addr)); #endif s = splnet(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { splx(s); return EADDRNOTAVAIL; } *nptr = rt->mf6c_next; free(rt, M_MRTABLE); splx(s); return 0; } static int socket_send(s, mm, src) struct socket *s; struct mbuf *mm; struct sockaddr_in6 *src; { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return 0; 
} } m_freem(mm); return -1; } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ int ip6_mforward(ip6, ifp, m) struct ip6_hdr *ip6; struct ifnet *ifp; struct mbuf *m; { struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; int s; mifi_t mifi; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ifp->if_index); #endif /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return 0; ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. * This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { ip6stat.ip6s_cantforward++; if (ip6_log_time + ip6_log_interval < time_second) { ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } return 0; } /* * Determine forwarding mifs from the forwarding cache table */ s = splnet(); MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); /* Entry exists, so forward if necessary */ if (rt) { splx(s); return (ip6_mdq(m, ifp, rt)); } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & * send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; /* int i, npkts;*/ #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif /* UPCALL_TIMING */ mrt6stat.mrt6s_no_route++; #ifdef MRT6DEBUG if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst)); #endif /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. */ rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT); if (rte == NULL) { splx(s); return ENOBUFS; } mb0 = m_copy(m, 0, M_COPYALL); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && (M_HASCL(mb0) || mb0->m_len < sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE); splx(s); return ENOBUFS; } /* is there an upcall waiting for this packet? 
*/ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE); m_freem(mb0); splx(s); return ENOBUFS; } /* * Make a copy of the header to send to the user * level process */ mm = m_copy(mb0, 0, sizeof(struct ip6_hdr)); if (mm == NULL) { free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im = NULL; #ifdef MRT6_OINIT oim = NULL; #endif switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return EINVAL; } #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "getting the iif info in the kernel\n"); #endif for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; break; #endif case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); mrt6stat.mrt6s_upq_sockfull++; free(rte, M_MRTABLE); m_freem(mb0); free(rt, M_MRTABLE); splx(s); return ENOBUFS; } mrt6stat.mrt6s_upcalls++; /* insert new entry at head of hash chain */ bzero(rt, sizeof(*rt)); rt->mf6c_origin.sin6_family = AF_INET6; rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_origin.sin6_addr = ip6->ip6_src; rt->mf6c_mcastgrp.sin6_family = AF_INET6; rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) if (++npkts > MAX_UPQ6) { mrt6stat.mrt6s_upq_ovflw++; free(rte, M_MRTABLE); m_freem(mb0); splx(s); return 0; } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif /* UPCALL_TIMING */ splx(s); return 0; } } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. 
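The expiry sweep that follows decrements a countdown on every unresolved entry and uses the per-bucket n6expire counters to skip buckets with nothing pending. A schematic sketch of that bookkeeping, with illustrative types (the real code also frees the queued packets and the entry itself):

struct entry {
	struct entry	*next;
	int		 pending;	/* has queued packets awaiting an upcall */
	int		 expire;	/* countdown in sweep ticks */
};

static void
sweep(struct entry **bucket, int *npending)
{
	struct entry **np, *e;

	if (*npending == 0)
		return;				/* nothing to expire in this bucket */
	for (np = bucket; (e = *np) != NULL; ) {
		if (e->pending && e->expire != 0 && --e->expire == 0) {
			*np = e->next;		/* unlink the dead entry */
			(*npending)--;
			/* free queued packets and the entry here */
		} else
			np = &e->next;
	}
}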
*/ static void expire_upcalls(unused) void *unused; { struct rtdetq *rte; struct mf6c *mfc, **nptr; int i; int s; s = splnet(); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_EXPIRE) log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n", ip6_sprintf(&mfc->mf6c_origin.sin6_addr), ip6_sprintf(&mfc->mf6c_mcastgrp.sin6_addr)); #endif /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE); rte = n; } while (rte != NULL); mrt6stat.mrt6s_cache_cleanups++; n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE); } else { nptr = &mfc->mf6c_next; } } } splx(s); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(m, ifp, rt) struct mbuf *m; struct ifnet *ifp; struct mf6c *rt; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (0) /* * Don't forward if it didn't arrive from the parent mif * for its origin. */ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ #ifdef MRT6DEBUG if (mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifid %d mifi %d mififid %x\n", ifp->if_index, mifi, mif6table[mifi].m6_ifp->if_index); #endif mrt6stat.mrt6s_wrong_if++; rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) if (pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... 
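The wrong-interface branch above is an RPF-style check: the cache entry records the expected upstream (parent) mif, and a packet arriving anywhere else is only counted, and optionally reported to the PIM daemon as a WRONGMIF upcall, never forwarded. A hedged sketch:

struct mif  { int ifindex; };
struct mfc  { unsigned int parent; unsigned long wrong_if; };

static int
rpf_ok(struct mfc *rt, const struct mif *miftable, unsigned int nmifs,
    int rcv_ifindex)
{
	if (rt->parent >= nmifs ||
	    miftable[rt->parent].ifindex != rcv_ifindex) {
		rt->wrong_if++;		/* cf. mrt6s_wrong_if */
		return 0;		/* caller may generate a WRONGMIF upcall */
	}
	return 1;			/* arrived on the parent mif: forward */
}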
*/ static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mbuf *mm; struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif mm = m_copy(m, 0, sizeof(struct ip6_hdr)); if (mm && (M_HASCL(mm) || mm->m_len < sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return ENOBUFS; #ifdef MRT6_OINIT oim = NULL; #endif im = NULL; switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return EINVAL; } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; switch (ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; #endif case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; } /* if socket Q full */ } /* if PIM */ return 0; } /* if wrong iif */ /* If I sourced this packet, it counts as output, else it was input. */ if (m->m_pkthdr.rcvif == NULL) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break * a scope boundary. * XXX For packets through the PIM register tunnel * interface, we trust the routing daemon. */ if ((mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) == 0 && (mif6table[mifi].m6_flags & MIFF_REGISTER) == 0 && (in6_addr2scopeid(ifp, &ip6->ip6_dst) != in6_addr2scopeid(mif6table[mifi].m6_ifp, &ip6->ip6_dst) || in6_addr2scopeid(ifp, &ip6->ip6_src) != in6_addr2scopeid(mif6table[mifi].m6_ifp, &ip6->ip6_src))) { ip6stat.ip6s_badscope++; continue; } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } return 0; } static void phyint_send(ip6, mifp, m) struct ip6_hdr *ip6; struct mif6 *mifp; struct mbuf *m; { struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; int s = splnet(); /* needs to protect static "ro" below. */ static struct route_in6 ro; struct in6_multi *in6m; struct sockaddr_in6 *dst6; /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. */ mb_copy = m_copy(m, 0, M_COPYALL); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { splx(s); return; } /* set the MCAST flag on the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may divide * the packet into fragments when the packet is too big for the * outgoing interface. * Otherwise, we can simply send the packet to the interface * sending queue.
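The dispatch choice described above, in sketch form: locally sourced packets go through ip6_output(), which may fragment them, while forwarded multicast is queued directly on the interface only when it fits, since an IPv6 router must not fragment in flight. The kernel condition below also queues when if_mtu < IPV6_MMTU; the sketch omits that quirk:

#include <stdbool.h>
#include <stddef.h>

enum verdict { USE_OUTPUT_PATH, QUEUE_ON_IFACE, TOO_BIG };

static enum verdict
mcast_xmit(bool locally_sourced, size_t pktlen, size_t link_mtu)
{
	if (locally_sourced)
		return USE_OUTPUT_PATH;	/* ip6_output() may fragment */
	if (pktlen <= link_mtu)
		return QUEUE_ON_IFACE;	/* (*ifp->if_output)() directly */
	return TOO_BIG;			/* ICMPv6 too-big, or just discard */
}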
*/ if (m->m_pkthdr.rcvif == NULL) { struct ip6_moptions im6o; im6o.im6o_multicast_ifp = ifp; /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, &ro, - IPV6_FORWARDING, &im6o, NULL); + IPV6_FORWARDING, &im6o, NULL, NULL); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif splx(s); return; } /* * If we belong to the destination multicast group * on the outgoing interface, loop back a copy. */ dst6 = (struct sockaddr_in6 *)&ro.ro_dst; IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); if (in6m != NULL) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; ip6_mloopback(ifp, m, (struct sockaddr_in6 *)&ro.ro_dst); } /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ if (mb_copy->m_pkthdr.len < ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; /* * We just call if_output instead of nd6_output here, since * we need no ND for a multicast forwarded packet...right? */ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, NULL); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif } else { #ifdef MULTICAST_PMTUD icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); #else #ifdef MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send: packet too big on %s o %s g %s" " size %d(discarded)\n", if_name(ifp), ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), mb_copy->m_pkthdr.len); #endif /* MRT6DEBUG */ m_freem(mb_copy); /* simply discard the packet */ #endif } splx(s); } static int register_send(ip6, mif, m) struct ip6_hdr *ip6; struct mif6 *mif; struct mbuf *m; { struct mbuf *mm; int i, len = m->m_pkthdr.len; static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mrt6msg *im6; #ifdef MRT6DEBUG if (mrt6debug) log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst)); #endif ++pim6stat.pim6s_snd_registers; /* Make a copy of the packet to send to the user level process */ MGETHDR(mm, M_DONTWAIT, MT_HEADER); if (mm == NULL) return ENOBUFS; mm->m_pkthdr.rcvif = NULL; mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); if ((mm->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { m_freem(mm); return ENOBUFS; } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return ENOBUFS; /* TODO: check it! */ mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. encap.n */ mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (mrt6debug) log(LOG_WARNING, "register_send: ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return ENOBUFS; } return 0; } /* * PIM sparse mode hook * Receives the pim control messages, and passes them up to the listening * socket, using rip6_input. * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. 
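pim6_input() below verifies the PIM checksum with in6_cksum(), which also folds in the IPv6 pseudo-header; for REGISTER messages only the 8-byte PIM header is covered, because the encapsulated data packet is checksummed end-to-end by its final recipients. A sketch of the coverage rule plus a plain Internet checksum, without the pseudo-header (the value used for PIM_REGISTER here is illustrative):

#include <stdint.h>
#include <stddef.h>

#define PIM_MINLEN	8
#define PIM_REGISTER	1	/* message type, illustrative value */

static uint16_t
ones_complement_sum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {		/* sum 16-bit words, network order */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static size_t
pim_cksum_len(int pim_type, size_t pimlen)
{
	/* REGISTER: exclude the encapsulated data packet */
	return pim_type == PIM_REGISTER ? PIM_MINLEN : pimlen;
}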
*/ int pim6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; struct mbuf *m = *mp; int minlen; int off = *offp; ++pim6stat.pim6s_rcv_total; ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG,"pim6_input: PIM packet too short\n"); #endif m_freem(m); return(IPPROTO_DONE); } /* * if the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers are in contiguous memory, * and possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { pim6stat.pim6s_rcv_tooshort++; return IPPROTO_DONE; } #endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { ++pim6stat.pim6s_rcv_badsum; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: invalid checksum\n"); #endif m_freem(m); return(IPPROTO_DONE); } } #endif /* PIM6_CHECKSUM */ /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { ++pim6stat.pim6s_rcv_badversion; #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: incorrect version %d, expecting %d\n", pim->pim_ver, PIM_VERSION); #endif m_freem(m); return(IPPROTO_DONE); } if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon.
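 *
 * That copy (mcp below, off + PIM6_REG_MINLEN bytes) covers, with
 * "off" being the offset of the PIM header:
 *
 *	 0            off       off+4    off+8           off+48
 *	 | outer ip6  | pim hdr | reghdr | encap ip6 hdr |
 *
 * while m_adj(m, off + PIM_MINLEN) leaves m pointing at the
 * encapsulated (inner) IPv6 packet for forwarding.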
*/ static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; int rc; ++pim6stat.pim6s_rcv_registers; if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: register mif not set: %d\n", reg_mif_num); #endif m_freem(m); return(IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: register packet size too " "small %d from %s\n", pimlen, ip6_sprintf(&ip6->ip6_src)); #endif m_freem(m); return(IPPROTO_DONE); } eip6 = (struct ip6_hdr *) (reghdr + 1); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input[register], eip6: %s -> %s, " "eip6 plen %d\n", ip6_sprintf(&eip6->ip6_src), ip6_sprintf(&eip6->ip6_dst), ntohs(eip6->ip6_plen)); #endif /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG log(LOG_DEBUG, "pim6_input: invalid IP version (%d) " "of the inner packet\n", (eip6->ip6_vfc & IPV6_VERSION)); #endif m_freem(m); return(IPPROTO_NONE); } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: inner packet of register " "is not multicast %s\n", ip6_sprintf(&eip6->ip6_dst)); #endif m_freem(m); return(IPPROTO_DONE); } /* * make a copy of the whole header to pass to the daemon later. */ mcp = m_copy(m, 0, off + PIM6_REG_MINLEN); if (mcp == NULL) { #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: pim register: " "could not copy register head\n"); #endif m_freem(m); return(IPPROTO_DONE); } /* * forward the inner ip6 packet; point m_data at the inner ip6. */ m_adj(m, off + PIM_MINLEN); #ifdef MRT6DEBUG if (mrt6debug & DEBUG_PIM) { log(LOG_DEBUG, "pim6_input: forwarding decapsulated register: " "src %s, dst %s, mif %d\n", ip6_sprintf(&eip6->ip6_src), ip6_sprintf(&eip6->ip6_dst), reg_mif_num); } #endif rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m, dst.sin6_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. */ pim6_input_to_daemon: rip6_input(&m, offp, proto); return(IPPROTO_DONE); } Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 105193) +++ head/sys/netinet6/ip6_output.c (revision 105194) @@ -1,2558 +1,2556 @@ /* $FreeBSD$ */ /* $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pfil_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PFIL_HOOKS #include #endif #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /* IPSEC */ #include #include #include static MALLOC_DEFINE(M_IPMOPTS, "ip6_moptions", "internet multicast options"); struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt)); static int ip6_setmoptions __P((int, struct ip6_moptions **, struct mbuf *)); static int ip6_getmoptions __P((int, struct ip6_moptions *, struct mbuf **)); static int ip6_copyexthdr __P((struct mbuf **, caddr_t, int)); static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int, struct ip6_frag **)); static int ip6_insert_jumboopt __P((struct ip6_exthdrs *, u_int32_t)); static int ip6_splithdr __P((struct mbuf *, struct ip6_exthdrs *)); /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * * type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_rmx.rmx_mtu. */ int -ip6_output(m0, opt, ro, flags, im6o, ifpp) +ip6_output(m0, opt, ro, flags, im6o, ifpp, inp) struct mbuf *m0; struct ip6_pktopts *opt; struct route_in6 *ro; int flags; struct ip6_moptions *im6o; struct ifnet **ifpp; /* XXX: just for statistics */ + struct inpcb *inp; { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; int hlen, tlen, len, off; struct route_in6 ip6route; struct sockaddr_in6 *dst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; #ifdef PFIL_HOOKS struct packet_filter_hook *pfh; struct mbuf *m1; int rv; #endif /* PFIL_HOOKS */ #ifdef IPSEC int needipsectun = 0; - struct socket *so; struct secpolicy *sp = NULL; + struct socket *so = inp ? inp->inp_socket : NULL; - /* for AH processing. stupid to have "socket" variable in IP layer... 
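 * (With this revision the pcb instead arrives as the explicit "inp"
 *  argument. A hypothetical caller that previously tagged the mbuf,
 *
 *	ipsec_setsocket(m, so);
 *	error = ip6_output(m, opt, &ro, flags, im6o, &ifp);
 *
 *  would now pass the pcb directly:
 *
 *	error = ip6_output(m, opt, &ro, flags, im6o, &ifp,
 *	    so ? sotoinpcb(so) : NULL);
 *  )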
*/ - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); ip6 = mtod(m, struct ip6_hdr *); #endif /* IPSEC */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (0) bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* get a security policy for this packet */ if (so == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 0, &error); else sp = ipsec6_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { ipsec6stat.out_inval++; goto freehdrs; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsec6stat.out_polvio++; goto freehdrs; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ needipsec = 0; break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto freehdrs; } needipsec = 1; break; case IPSEC_POLICY_ENTRUST: default: printf("ip6_output: Invalid policy found. %d\n", sp->policy); } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here. do that later. */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If we need IPsec, or there is at least one extension header, * separate IP6 header from the payload. */ if ((needipsec || optlen) && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ { u_char *nexthdrp = &ip6->ip6_nxt; struct mbuf *mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. 
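 *
 * To illustrate, once the MAKE_CHAIN() calls below finish, a packet
 * carrying every option is chained as
 *
 *	IPv6 -> hbh -> dest1 -> rthdr -> dest2 -> payload
 *	 (m)                    (mprev)
 *
 * so a transport-mode AH/ESP header inserted after *mprev lands
 * between rthdr and dest2, as intended.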
*/ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (0) /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); #ifdef IPSEC if (!needipsec) goto skip_ipsec2; /* * pointers after IPsec headers are not valid any more. * other pointers need a great care too. * (IPsec routines should not mangle mbufs prior to AH/ESP) */ exthdrs.ip6e_dest2 = NULL; { struct ip6_rthdr *rh = NULL; int segleft_org = 0; struct ipsec_output_state state; if (exthdrs.ip6e_rthdr) { rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *); segleft_org = rh->ip6r_segleft; rh->ip6r_segleft = 0; } bzero(&state, sizeof(state)); state.m = m; error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags, &needipsectun); m = state.m; if (error) { /* mbuf is already reclaimed in ipsec6_output_trans. */ m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* fall through */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } if (exthdrs.ip6e_rthdr) { /* ah6_output doesn't modify mbuf chain */ rh->ip6r_segleft = segleft_org; } } skip_ipsec2:; #endif } /* * If there is a routing header, replace destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { struct ip6_rthdr *rh = (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *)); struct ip6_rthdr0 *rh0; finaldst = ip6->ip6_dst; switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; ip6->ip6_dst = rh0->ip6r0_addr[0]; bcopy((caddr_t)&rh0->ip6r0_addr[1], (caddr_t)&rh0->ip6r0_addr[0], sizeof(struct in6_addr)*(rh0->ip6r0_segleft - 1) ); rh0->ip6r0_addr[rh0->ip6r0_segleft - 1] = finaldst; break; default: /* is it possible? */ error = EINVAL; goto bad; } } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_DADOUTPUT) == 0) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; } ip6stat.ip6s_localout++; /* * Route packet. */ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. 
*/ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6->ip6_dst))) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } if (ro->ro_rt == 0) { bzero(dst, sizeof(*dst)); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = ip6->ip6_dst; #ifdef SCOPEDROUTING /* XXX: sin6_scope_id should already be fixed at this point */ if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) dst->sin6_scope_id = ntohs(dst->sin6_addr.s6_addr16[1]); #endif } #ifdef IPSEC if (needipsec && needipsectun) { struct ipsec_output_state state; /* * All the extension headers will become inaccessible * (since they can be encrypted). * Don't panic, we need no more updates to extension headers * on inner IPv6 packet (since they are now encapsulated). * * IPv6 [ESP|AH] IPv6 [extension headers] payload */ bzero(&exthdrs, sizeof(exthdrs)); exthdrs.ip6e_ip6 = m; bzero(&state, sizeof(state)); state.m = m; state.ro = (struct route *)ro; state.dst = (struct sockaddr *)dst; error = ipsec6_output_tunnel(&state, sp, flags); m = state.m; ro = (struct route_in6 *)state.ro; dst = (struct sockaddr_in6 *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ m0 = m = NULL; m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* fall through */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } exthdrs.ip6e_ip6 = m; } #endif /* IPSEC */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* Unicast */ #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) /* xxx * interface selection comes here * if an interface is specified from an upper layer, * ifp must point it. */ if (ro->ro_rt == 0) { /* * non-bsdi always clone routes, if parent is * PRF_CLONING. */ rtalloc((struct route *)ro); } if (ro->ro_rt == 0) { ip6stat.ip6s_noroute++; error = EHOSTUNREACH; /* XXX in6_ifstat_inc(ifp, ifs6_out_discard); */ goto bad; } ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway; m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ in6_ifstat_inc(ifp, ifs6_out_request); /* * Check if the outgoing interface conflicts with * the interface specified by ifi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (opt && opt->ip6po_pktinfo && opt->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opt->ip6po_pktinfo->ipi6_ifindex) { ip6stat.ip6s_noroute++; in6_ifstat_inc(ifp, ifs6_out_discard); error = EHOSTUNREACH; goto bad; } } if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; } else { /* Multicast */ struct in6_multi *in6m; m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; /* * See if the caller provided any multicast options */ ifp = NULL; if (im6o != NULL) { ip6->ip6_hlim = im6o->im6o_multicast_hlim; if (im6o->im6o_multicast_ifp != NULL) ifp = im6o->im6o_multicast_ifp; } else ip6->ip6_hlim = ip6_defmcasthlim; /* * See if the caller provided the outgoing interface * as an ancillary data. * Boundary check for ifindex is assumed to be already done. 
*/ if (opt && opt->ip6po_pktinfo && opt->ip6po_pktinfo->ipi6_ifindex) ifp = ifnet_byindex(opt->ip6po_pktinfo->ipi6_ifindex); /* * If the destination is a node-local scope multicast, * the packet should be loop-backed only. */ if (IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst)) { /* * If the outgoing interface is already specified, * it should be a loopback interface. */ if (ifp && (ifp->if_flags & IFF_LOOPBACK) == 0) { ip6stat.ip6s_badscope++; error = ENETUNREACH; /* XXX: better error? */ /* XXX correct ifp? */ in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } else { ifp = &loif[0]; } } if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; /* * If the caller did not provide an interface, look up a * default in the routing table. This is either a * default for the specified group (i.e. a host * route), or a multicast default (a route for the * ``net'' ff00::/8). */ if (ifp == NULL) { if (ro->ro_rt == 0) { ro->ro_rt = rtalloc1((struct sockaddr *) &ro->ro_dst, 0, 0UL); } if (ro->ro_rt == 0) { ip6stat.ip6s_noroute++; error = EHOSTUNREACH; /* XXX in6_ifstat_inc(ifp, ifs6_out_discard) */ goto bad; } ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; } if ((flags & IPV6_FORWARDING) == 0) in6_ifstat_inc(ifp, ifs6_out_request); in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { ip6stat.ip6s_noroute++; in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); if (in6m != NULL && (im6o == NULL || im6o->im6o_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip6_mloopback(ifp, m, dst); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK)) { m_freem(m); goto done; } } /* * Fill the outgoing interface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* * Determine path MTU. */ if (ro_pmtu != ro) { /* The first hop and the final destination may differ.
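 * For example, with a type 0 routing header the packet is routed to
 * its first intermediate hop (ro, pointed at opt->ip6po_route above),
 * while path MTU discovery must track the final destination
 * (ro_pmtu, keyed on finaldst). Without a routing header the two
 * routes are one and the same.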
*/ struct sockaddr_in6 *sin6_fin = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (ro_pmtu->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&sin6_fin->sin6_addr, &finaldst))) { RTFREE(ro_pmtu->ro_rt); ro_pmtu->ro_rt = (struct rtentry *)0; } if (ro_pmtu->ro_rt == 0) { bzero(sin6_fin, sizeof(*sin6_fin)); sin6_fin->sin6_family = AF_INET6; sin6_fin->sin6_len = sizeof(struct sockaddr_in6); sin6_fin->sin6_addr = finaldst; rtalloc((struct route *)ro_pmtu); } } if (ro_pmtu->ro_rt != NULL) { u_int32_t ifmtu = nd_ifinfo[ifp->if_index].linkmtu; mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; if (mtu > ifmtu || mtu == 0) { /* * The MTU on the route is larger than the MTU on * the interface! This shouldn't happen, unless the * MTU of the interface has been changed after the * interface was brought up. Change the MTU in the * route to match the interface MTU (as long as the * field isn't locked). * * if MTU on the route is 0, we need to fix the MTU. * this case happens with path MTU discovery timeouts. */ mtu = ifmtu; if ((ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0) ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ } } else { mtu = nd_ifinfo[ifp->if_index].linkmtu; } /* * advanced API (IPV6_USE_MIN_MTU) overrides mtu setting */ if ((flags & IPV6_MINMTU) != 0 && mtu > IPV6_MMTU) mtu = IPV6_MMTU; /* Fake scoped addresses */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* * If source or destination address is a scoped address, and * the packet is going to be sent to a loopback interface, * we should keep the original interface. */ /* * XXX: this is a very experimental and temporary solution. * We eventually have sockaddr_in6 and use the sin6_scope_id * field of the structure here. * We rely on the consistency between two scope zone ids * of source and destination, which should already be assured. * Larger scopes than link will be supported in the future. */ origifp = NULL; if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) origifp = ifnet_byindex(ntohs(ip6->ip6_src.s6_addr16[1])); else if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) origifp = ifnet_byindex(ntohs(ip6->ip6_dst.s6_addr16[1])); /* * XXX: origifp can be NULL even in those two cases above. * For example, if we remove the (only) link-local address * from the loopback interface, and try to send a link-local * address without link-id information. Then the source * address is ::1, and the destination address is the * link-local address with its s6_addr16[1] being zero. * What is worse, if the packet goes to the loopback interface * by a default rejected route, the null pointer would be * passed to looutput, and the kernel would hang. * The following last resort would prevent such disaster. */ if (origifp == NULL) origifp = ifp; } else origifp = ifp; #ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); #endif /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; m->m_pkthdr.rcvif = NULL; /* XXX */ /* If ipfw says divert, we have to just drop packet */ if ((*ip6_fw_chk_ptr)(&ip6, ifp, &port, &m)) { m_freem(m); goto done; } if (!m) { error = EACCES; goto done; } } /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) 
*/ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy1; /* XXX unused */ u_int32_t dummy2; /* XXX unused */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not continuous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * continuous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy1, &dummy2) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } #ifdef PFIL_HOOKS /* * Run through list of hooks for output packets. */ m1 = m; pfh = pfil_hook_get(PFIL_OUT, &inet6sw[ip6_protox[IPPROTO_IPV6]].pr_pfh); for (; pfh; pfh = pfh->pfil_link.tqe_next) if (pfh->pfil_func) { rv = pfh->pfil_func(ip6, sizeof(*ip6), ifp, 1, &m1); if (rv) { error = EHOSTUNREACH; goto done; } m = m1; if (m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); } #endif /* PFIL_HOOKS */ /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. */ tlen = m->m_pkthdr.len; if (tlen <= mtu #ifdef notyet /* * On any link that cannot convey a 1280-octet packet in one piece, * link-specific fragmentation and reassembly must be provided at * a layer below IPv6. [RFC 2460, sec.5] * Thus if the interface has ability of link-level fragmentation, * we can just send the packet even if the packet size is * larger than the link's MTU. * XXX: IFF_FRAGMENTABLE (or such) flag has not been defined yet... */ || ifp->if_flags & IFF_FRAGMENTABLE #endif ) { /* Record statistics for this interface address. */ if (ia && !(flags & IPV6_FORWARDING)) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; } else if (mtu < IPV6_MMTU) { /* * note that path MTU is never less than IPV6_MMTU * (see icmp6_input). */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { struct mbuf **mnext, *m_frgpart; struct ip6_frag *ip6f; u_int32_t id = htonl(ip6_id++); u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } mnext = &m->m_nextpkt; /* * Change the next header field of the last header in the * unfragmentable part. 
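 *
 * Each fragment built in the loop below is laid out as
 *
 *	[ unfragmentable part (hlen bytes) | frag hdr | data (len bytes) ]
 *
 * with the per-fragment payload trimmed to a multiple of eight, as
 * computed above:
 *
 *	len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;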
*/ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; for (off = hlen; off < tlen; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); if (!m) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto sendorfree; } m->m_pkthdr.rcvif = NULL; m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { ip6stat.ip6s_odropped++; goto sendorfree; } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + len >= tlen) len = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(len + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, len)) == 0) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto sendorfree; } m_cat(m, m_frgpart); m->m_pkthdr.len = len + hlen + sizeof(*ip6f); m->m_pkthdr.rcvif = (struct ifnet *)0; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; ip6stat.ip6s_ofragments++; in6_ifstat_inc(ifp, ifs6_out_fragcreat); } in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbage. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ip6stat.ip6s_fragmented++; done: if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */ RTFREE(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { RTFREE(ro_pmtu->ro_rt); } #ifdef IPSEC if (sp != NULL) key_freesp(sp); #endif /* IPSEC */ return(error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* fall through */ bad: m_freem(m); goto done; } static int ip6_copyexthdr(mp, hdr, hlen) struct mbuf **mp; caddr_t hdr; int hlen; { struct mbuf *m; if (hlen > MCLBYTES) return(ENOBUFS); /* XXX */ MGET(m, M_DONTWAIT, MT_DATA); if (!m) return(ENOBUFS); if (hlen > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return(ENOBUFS); } } m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return(0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(exthdrs, plen) struct ip6_exthdrs *exthdrs; u_int32_t plen; { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate a new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == 0) return(ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return(ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ MGET(n, M_DONTWAIT, MT_DATA); if (n) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return(ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return(0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(m0, m, hlen, frghdrp) struct mbuf *m0, *m; int hlen; struct ip6_frag **frghdrp; { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == 0) return(ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if ((mlast->m_flags & M_EXT) == 0 && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == 0) return(ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return(0); } /* * IP6 socket option processing. */ int ip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int privileged; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; } else { panic("ip6_ctloutput: arg soopt is NULL"); } error = optval = 0; privileged = (td == 0 || suser(td)) ? 
0 : 1; if (level == IPPROTO_IPV6) { switch (op) { case SOPT_SET: switch (optname) { case IPV6_PKTOPTIONS: { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: case IPV6_FAITH: case IPV6_V6ONLY: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->in6p_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ if (optval) \ in6p->in6p_flags |= (bit); \ else \ in6p->in6p_flags &= ~(bit); \ } while (0) #define OPTBIT(bit) (in6p->in6p_flags & (bit) ? 1 : 0) case IPV6_CHECKSUM: in6p->in6p_cksum = optval; break; case IPV6_FAITH: OPTSET(IN6P_FAITH); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->in6p_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->in6p_vflag &= ~INP_IPV4; else in6p->in6p_vflag |= INP_IPV4; break; } break; case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_PKTINFO: OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: OPTSET(IN6P_HOPLIMIT); break; case IPV6_HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (!privileged) return(EPERM); OPTSET(IN6P_HOPOPTS); break; case IPV6_DSTOPTS: if (!privileged) return(EPERM); OPTSET(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_RTHDR: OPTSET(IN6P_RTHDR); break; } break; #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } /* XXX */ MGET(m, sopt->sopt_td ? 
M_TRYWAIT : M_DONTWAIT, MT_HEADER); if (m == 0) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); error = ip6_setmoptions(sopt->sopt_name, &in6p->in6p_moptions, m); (void)m_free(m); } break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->in6p_flags &= ~(IN6P_LOWPORT); in6p->in6p_flags &= ~(IN6P_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->in6p_flags &= ~(IN6P_LOWPORT); in6p->in6p_flags |= IN6P_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->in6p_flags &= ~(IN6P_HIGHPORT); in6p->in6p_flags |= IN6P_LOWPORT; break; default: error = EINVAL; break; } break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec6_set_policy(in6p, optname, req, len, privileged); m_freem(m); } break; #endif /* KAME IPSEC */ case IPV6_FW_ADD: case IPV6_FW_DEL: case IPV6_FW_FLUSH: case IPV6_FW_ZERO: { struct mbuf *m; struct mbuf **mp = &m; if (ip6_fw_ctl_ptr == NULL) return EINVAL; /* XXX */ if ((error = soopt_getm(sopt, &m)) != 0) break; /* XXX */ if ((error = soopt_mcopyin(sopt, m)) != 0) break; error = (*ip6_fw_ctl_ptr)(optname, mp); m = *mp; } break; default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_PKTOPTIONS: if (in6p->in6p_options) { struct mbuf *m; m = m_copym(in6p->in6p_options, 0, M_COPYALL, M_TRYWAIT); error = soopt_mcopyout(sopt, m); if (error == 0) m_freem(m); } else sopt->sopt_valsize = 0; break; case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_PORTRANGE: switch (optname) { case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_CHECKSUM: optval = in6p->in6p_cksum; break; case IPV6_FAITH: optval = OPTBIT(IN6P_FAITH); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->in6p_flags; if (flags & IN6P_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & IN6P_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: if (optname == IPV6_HOPOPTS || optname == IPV6_DSTOPTS || !privileged) return(EPERM); switch (optname) { case IPV6_PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_HOPOPTS: if (!privileged) return(EPERM); optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_DSTOPTS: if (!privileged) return(EPERM); optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: { struct mbuf *m; error = ip6_getmoptions(sopt->sopt_name, in6p->in6p_moptions, &m); if (error == 0) error = sooptcopyout(sopt, mtod(m, char *), m->m_len); m_freem(m); } break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; if (m) { req = mtod(m, 
caddr_t); len = m->m_len; } error = ipsec6_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /*XXX*/ if (error == 0 && m) m_freem(m); break; } #endif /* KAME IPSEC */ case IPV6_FW_GET: { struct mbuf *m; struct mbuf **mp = &m; if (ip6_fw_ctl_ptr == NULL) { return EINVAL; } error = (*ip6_fw_ctl_ptr)(optname, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); } break; default: error = ENOPROTOOPT; break; } break; } } else { error = EINVAL; } return(error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(pktopt, m, so, sopt) struct ip6_pktopts **pktopt; struct mbuf *m; struct socket *so; struct sockopt *sopt; { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; int priv = 0; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, 1, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options. */ if (opt) free(opt, M_IP6OPT); return(0); } /* set options specified by user. */ if (td && !suser(td)) priv = 1; if ((error = ip6_setpktoptions(m, opt, priv, 1)) != 0) { ip6_clearpktopts(opt, 1, -1); /* XXX: discard all options */ return(error); } *pktopt = opt; return(0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void init_ip6pktopts(opt) struct ip6_pktopts *opt; { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ } void ip6_clearpktopts(pktopt, needfree, optname) struct ip6_pktopts *pktopt; int needfree, optname; { if (pktopt == NULL) return; if (optname == -1) { if (needfree && pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1) pktopt->ip6po_hlim = -1; if (optname == -1) { if (needfree && pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1) { if (needfree && pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1) { if (needfree && pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1) { if (needfree && pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1) { if (needfree && pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen =\ (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (0) struct ip6_pktopts * ip6_copypktopts(src, canwait) struct ip6_pktopts *src; int canwait; { struct ip6_pktopts *dst; if (src == NULL) { printf("ip6_copypktopts: invalid argument\n"); return(NULL); } dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL && canwait == M_NOWAIT) goto bad; bzero(dst, sizeof(*dst)); dst->ip6po_hlim = src->ip6po_hlim; if
(src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL && canwait == M_NOWAIT) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL && canwait == M_NOWAIT) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* don't copy the cached route */ return(dst); bad: printf("ip6_copypktopts: copy failed\n"); if (dst->ip6po_pktinfo) free(dst->ip6po_pktinfo, M_IP6OPT); if (dst->ip6po_nexthop) free(dst->ip6po_nexthop, M_IP6OPT); if (dst->ip6po_hbh) free(dst->ip6po_hbh, M_IP6OPT); if (dst->ip6po_dest1) free(dst->ip6po_dest1, M_IP6OPT); if (dst->ip6po_dest2) free(dst->ip6po_dest2, M_IP6OPT); if (dst->ip6po_rthdr) free(dst->ip6po_rthdr, M_IP6OPT); return(NULL); } #undef PKTOPT_EXTHDRCPY void ip6_freepcbopts(pktopt) struct ip6_pktopts *pktopt; { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, 1, -1); free(pktopt, M_IP6OPT); } /* * Set the IP6 multicast options in response to user setsockopt(). */ static int ip6_setmoptions(optname, im6op, m) int optname; struct ip6_moptions **im6op; struct mbuf *m; { int error = 0; u_int loop, ifindex; struct ipv6_mreq *mreq; struct ifnet *ifp; struct ip6_moptions *im6o = *im6op; struct route_in6 ro; struct sockaddr_in6 *dst; struct in6_multi_mship *imm; struct thread *td = curthread; /* XXX */ if (im6o == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ im6o = (struct ip6_moptions *) malloc(sizeof(*im6o), M_IPMOPTS, M_WAITOK); if (im6o == NULL) return(ENOBUFS); *im6op = im6o; im6o->im6o_multicast_ifp = NULL; im6o->im6o_multicast_hlim = ip6_defmcasthlim; im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } switch (optname) { case IPV6_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex)); if (ifindex < 0 || if_index < ifindex) { error = ENXIO; /* XXX EINVAL? */ break; } ifp = ifnet_byindex(ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; break; } im6o->im6o_multicast_ifp = ifp; break; case IPV6_MULTICAST_HOPS: { /* * Set the IP6 hoplimit for outgoing multicast packets. */ int optval; if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &optval, sizeof(optval)); if (optval < -1 || optval >= 256) error = EINVAL; else if (optval == -1) im6o->im6o_multicast_hlim = ip6_defmcasthlim; else im6o->im6o_multicast_hlim = optval; break; } case IPV6_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &loop, sizeof(loop)); if (loop > 1) { error = EINVAL; break; } im6o->im6o_multicast_loop = loop; break; case IPV6_JOIN_GROUP: /* * Add a multicast group membership. * Group must be a valid IP6 multicast address.
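 *
 * A hypothetical userland counterpart of this option (illustrative
 * only; the group, interface name, and socket descriptor "s" are
 * placeholders):
 *
 *	struct ipv6_mreq mreq;
 *	inet_pton(AF_INET6, "ff02::1:2", &mreq.ipv6mr_multiaddr);
 *	mreq.ipv6mr_interface = if_nametoindex("lo0");
 *	setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq, sizeof(mreq));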
*/ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { /* * We use the unspecified address to specify to accept * all multicast addresses. Only super user is allowed * to do this. */ if (suser(td)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * If the interface is specified, validate it. */ if (mreq->ipv6mr_interface < 0 || if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } /* * If no interface was explicitly specified, choose an * appropriate one according to the given multicast address. */ if (mreq->ipv6mr_interface == 0) { /* * If the multicast address is in node-local scope, * the interface should be a loopback interface. * Otherwise, look up the routing table for the * address, and choose the outgoing interface. * XXX: is it a good approach? */ if (IN6_IS_ADDR_MC_NODELOCAL(&mreq->ipv6mr_multiaddr)) { ifp = &loif[0]; } else { ro.ro_rt = NULL; dst = (struct sockaddr_in6 *)&ro.ro_dst; bzero(dst, sizeof(*dst)); dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_family = AF_INET6; dst->sin6_addr = mreq->ipv6mr_multiaddr; rtalloc((struct route *)&ro); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; break; } ifp = ro.ro_rt->rt_ifp; rtfree(ro.ro_rt); } } else ifp = ifnet_byindex(mreq->ipv6mr_interface); /* * See if we found an interface, and confirm that it * supports multicast */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; break; } /* * Put interface index into the multicast address, * if the address has link-local scope. */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { mreq->ipv6mr_multiaddr.s6_addr16[1] = htons(mreq->ipv6mr_interface); } /* * See if the membership already exists. */ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) if (imm->i6mm_maddr->in6m_ifp == ifp && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; if (imm != NULL) { error = EADDRINUSE; break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ imm = malloc(sizeof(*imm), M_IPMADDR, M_WAITOK); if (imm == NULL) { error = ENOBUFS; break; } if ((imm->i6mm_maddr = in6_addmulti(&mreq->ipv6mr_multiaddr, ifp, &error)) == NULL) { free(imm, M_IPMADDR); break; } LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); break; case IPV6_LEAVE_GROUP: /* * Drop a multicast group membership. * Group must be a valid IP6 multicast address. */ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { if (suser(td)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq->ipv6mr_interface < 0 || if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } ifp = ifnet_byindex(mreq->ipv6mr_interface); /* * Put interface index into the multicast address, * if the address has link-local scope. */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { mreq->ipv6mr_multiaddr.s6_addr16[1] = htons(mreq->ipv6mr_interface); } /* * Find the membership in the membership list. 
*/ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) { if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; } if (imm == NULL) { /* Unable to resolve interface */ error = EADDRNOTAVAIL; break; } /* * Give up the multicast address record to which the * membership points. */ LIST_REMOVE(imm, i6mm_chain); in6_delmulti(imm->i6mm_maddr); free(imm, M_IPMADDR); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. */ if (im6o->im6o_multicast_ifp == NULL && im6o->im6o_multicast_hlim == ip6_defmcasthlim && im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && im6o->im6o_memberships.lh_first == NULL) { free(*im6op, M_IPMOPTS); *im6op = NULL; } return(error); } /* * Return the IP6 multicast options in response to user getsockopt(). */ static int ip6_getmoptions(optname, im6o, mp) int optname; struct ip6_moptions *im6o; struct mbuf **mp; { u_int *hlim, *loop, *ifindex; *mp = m_get(M_TRYWAIT, MT_HEADER); /* XXX */ switch (optname) { case IPV6_MULTICAST_IF: ifindex = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) *ifindex = 0; else *ifindex = im6o->im6o_multicast_ifp->if_index; return(0); case IPV6_MULTICAST_HOPS: hlim = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) *hlim = ip6_defmcasthlim; else *hlim = im6o->im6o_multicast_hlim; return(0); case IPV6_MULTICAST_LOOP: loop = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) *loop = IPV6_DEFAULT_MULTICAST_LOOP; else *loop = im6o->im6o_multicast_loop; return(0); default: return(EOPNOTSUPP); } } /* * Discard the IP6 multicast options. */ void ip6_freemoptions(im6o) struct ip6_moptions *im6o; { struct in6_multi_mship *imm; if (im6o == NULL) return; while ((imm = im6o->im6o_memberships.lh_first) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr) in6_delmulti(imm->i6mm_maddr); free(imm, M_IPMADDR); } free(im6o, M_IPMOPTS); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktoptions(control, opt, priv, needcopy) struct mbuf *control; struct ip6_pktopts *opt; int priv, needcopy; { struct cmsghdr *cm = 0; if (control == 0 || opt == 0) return(EINVAL); init_ip6pktopts(opt); /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return(EINVAL); for (; control->m_len; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return(EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; /* * XXX should check if RFC2292 API is mixed with 2292bis API */ switch (cm->cmsg_type) { case IPV6_PKTINFO: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) return(EINVAL); if (needcopy) { /* XXX: Is it really WAITOK?
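 * (For reference, the cmsg parsed here is built by a sender roughly
 *  as follows -- illustrative only, "src" and "ifindex" being
 *  placeholders:
 *
 *	struct in6_pktinfo *pi;
 *	cm->cmsg_level = IPPROTO_IPV6;
 *	cm->cmsg_type = IPV6_PKTINFO;
 *	cm->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
 *	pi = (struct in6_pktinfo *)CMSG_DATA(cm);
 *	pi->ipi6_addr = src;
 *	pi->ipi6_ifindex = ifindex;
 *  )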
*/ opt->ip6po_pktinfo = malloc(sizeof(struct in6_pktinfo), M_IP6OPT, M_WAITOK); bcopy(CMSG_DATA(cm), opt->ip6po_pktinfo, sizeof(struct in6_pktinfo)); } else opt->ip6po_pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm); if (opt->ip6po_pktinfo->ipi6_ifindex && IN6_IS_ADDR_LINKLOCAL(&opt->ip6po_pktinfo->ipi6_addr)) opt->ip6po_pktinfo->ipi6_addr.s6_addr16[1] = htons(opt->ip6po_pktinfo->ipi6_ifindex); if (opt->ip6po_pktinfo->ipi6_ifindex > if_index || opt->ip6po_pktinfo->ipi6_ifindex < 0) { return(ENXIO); } /* * Check if the requested source address is indeed a * unicast address assigned to the node, and can be * used as the packet's source address. */ if (!IN6_IS_ADDR_UNSPECIFIED(&opt->ip6po_pktinfo->ipi6_addr)) { struct in6_ifaddr *ia6; struct sockaddr_in6 sin6; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = opt->ip6po_pktinfo->ipi6_addr; ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(sin6tosa(&sin6)); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) != 0) return(EADDRNOTAVAIL); } break; case IPV6_HOPLIMIT: if (cm->cmsg_len != CMSG_LEN(sizeof(int))) return(EINVAL); opt->ip6po_hlim = *(int *)CMSG_DATA(cm); if (opt->ip6po_hlim < -1 || opt->ip6po_hlim > 255) return(EINVAL); break; case IPV6_NEXTHOP: if (!priv) return(EPERM); if (cm->cmsg_len < sizeof(u_char) || /* check if cmsg_len is large enough for sa_len */ cm->cmsg_len < CMSG_LEN(*CMSG_DATA(cm))) return(EINVAL); if (needcopy) { opt->ip6po_nexthop = malloc(*CMSG_DATA(cm), M_IP6OPT, M_WAITOK); bcopy(CMSG_DATA(cm), opt->ip6po_nexthop, *CMSG_DATA(cm)); } else opt->ip6po_nexthop = (struct sockaddr *)CMSG_DATA(cm); break; case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_hbh))) return(EINVAL); hbh = (struct ip6_hbh *)CMSG_DATA(cm); hbhlen = (hbh->ip6h_len + 1) << 3; if (cm->cmsg_len != CMSG_LEN(hbhlen)) return(EINVAL); if (needcopy) { opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_WAITOK); bcopy(hbh, opt->ip6po_hbh, hbhlen); } else opt->ip6po_hbh = hbh; break; } case IPV6_DSTOPTS: { struct ip6_dest *dest, **newdest; int destlen; if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_dest))) return(EINVAL); dest = (struct ip6_dest *)CMSG_DATA(cm); destlen = (dest->ip6d_len + 1) << 3; if (cm->cmsg_len != CMSG_LEN(destlen)) return(EINVAL); /* * The old advacned API is ambiguous on this * point. Our approach is to determine the * position based according to the existence * of a routing header. Note, however, that * this depends on the order of the extension * headers in the ancillary data; the 1st part * of the destination options header must * appear before the routing header in the * ancillary data, too. * RFC2292bis solved the ambiguity by * introducing separate cmsg types. 
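 *
 * (Illustration, added for clarity: under the heuristic below, a
 * sender wanting destination options both before and after a routing
 * header must order its ancillary data as dest1, rthdr, dest2.
 * Hypothetical userland sketch, assuming a prepared struct msghdr
 * msg:
 *
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_type = IPV6_DSTOPTS;	(first part -> ip6po_dest1)
 *	cm = CMSG_NXTHDR(&msg, cm);
 *	cm->cmsg_type = IPV6_RTHDR;	(-> ip6po_rthdr)
 *	cm = CMSG_NXTHDR(&msg, cm);
 *	cm->cmsg_type = IPV6_DSTOPTS;	(second part -> ip6po_dest2)
 *
 * with cmsg_level set to IPPROTO_IPV6 and cmsg_len filled in via
 * CMSG_LEN() for each entry.)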
*/ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; if (needcopy) { *newdest = malloc(destlen, M_IP6OPT, M_WAITOK); bcopy(dest, *newdest, destlen); } else *newdest = dest; break; } case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_rthdr))) return(EINVAL); rth = (struct ip6_rthdr *)CMSG_DATA(cm); rthlen = (rth->ip6r_len + 1) << 3; if (cm->cmsg_len != CMSG_LEN(rthlen)) return(EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: /* must contain one addr */ if (rth->ip6r_len == 0) return(EINVAL); /* length must be even */ if (rth->ip6r_len % 2) return(EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return(EINVAL); break; default: return(EINVAL); /* not supported */ } if (needcopy) { opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_WAITOK); bcopy(rth, opt->ip6po_rthdr, rthlen); } else opt->ip6po_rthdr = rth; break; } default: return(ENOPROTOOPT); } } return(0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(ifp, m, dst) struct ifnet *ifp; struct mbuf *m; struct sockaddr_in6 *dst; { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if ((copym->m_flags & M_EXT) != 0 || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); #ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); #endif (void)if_simloop(ifp, copym, dst->sin6_family, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(m, exthdrs) struct mbuf *m; struct ip6_exthdrs *exthdrs; { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == 0) { m_freem(m); return ENOBUFS; } M_COPY_PKTHDR(mh, m); MH_ALIGN(mh, sizeof(*ip6)); m->m_flags &= ~M_PKTHDR; m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(in6p) struct in6pcb *in6p; { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? 
(((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netinet6/ip6_var.h =================================================================== --- head/sys/netinet6/ip6_var.h (revision 105193) +++ head/sys/netinet6/ip6_var.h (revision 105194) @@ -1,353 +1,354 @@ /* $FreeBSD$ */ /* $KAME: ip6_var.h,v 1.62 2001/05/03 14:51:48 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.1 (Berkeley) 6/10/93 */ #ifndef _NETINET6_IP6_VAR_H_ #define _NETINET6_IP6_VAR_H_ /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { u_int32_t ip6q_head; u_int16_t ip6q_len; u_int8_t ip6q_nxt; /* ip6f_nxt in first fragment */ u_int8_t ip6q_hlim; struct ip6asfrag *ip6q_down; struct ip6asfrag *ip6q_up; u_int32_t ip6q_ident; u_int8_t ip6q_arrive; u_int8_t ip6q_ttl; struct in6_addr ip6q_src, ip6q_dst; struct ip6q *ip6q_next; struct ip6q *ip6q_prev; int ip6q_unfrglen; /* len of unfragmentable part */ #ifdef notyet u_char *ip6q_nxtp; #endif }; struct ip6asfrag { u_int32_t ip6af_head; u_int16_t ip6af_len; u_int8_t ip6af_nxt; u_int8_t ip6af_hlim; /* must not override the above members during reassembling */ struct ip6asfrag *ip6af_down; struct ip6asfrag *ip6af_up; struct mbuf *ip6af_m; int ip6af_offset; /* offset in ip6af_m to next header */ int ip6af_frglen; /* fragmentable part length */ int ip6af_off; /* fragment offset */ u_int16_t ip6af_mff; /* more fragment bit in frag off */ }; #define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) struct ip6_moptions { struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ LIST_HEAD(, in6_multi_mship) im6o_memberships; }; /* * Control options for outgoing packets */ /* Routing header related info */ struct ip6po_rhinfo { struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */ struct route_in6 ip6po_rhi_route; /* Route to the 1st hop */ }; #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route struct ip6_pktopts { struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ int ip6po_hlim; /* Hoplimit for outgoing packets */ /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; struct sockaddr *ip6po_nexthop; /* Next-hop address */ struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ struct ip6_dest *ip6po_dest1; /* Routing header related info. 
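 *
 * (Note added for clarity: the ip6po_rthdr and ip6po_route
 * convenience macros above resolve into this member, pairing the
 * routing header with a cached route to its first hop.)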
*/ struct ip6po_rhinfo ip6po_rhinfo; /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; }; /* * Control options for incoming packets */ struct ip6stat { u_quad_t ip6s_total; /* total packets received */ u_quad_t ip6s_tooshort; /* packet too short */ u_quad_t ip6s_toosmall; /* not enough data */ u_quad_t ip6s_fragments; /* fragments received */ u_quad_t ip6s_fragdropped; /* frags dropped(dups, out of space) */ u_quad_t ip6s_fragtimeout; /* fragments timed out */ u_quad_t ip6s_fragoverflow; /* fragments that exceeded limit */ u_quad_t ip6s_forward; /* packets forwarded */ u_quad_t ip6s_cantforward; /* packets rcvd for unreachable dest */ u_quad_t ip6s_redirectsent; /* packets forwarded on same net */ u_quad_t ip6s_delivered; /* datagrams delivered to upper level*/ u_quad_t ip6s_localout; /* total ip packets generated here */ u_quad_t ip6s_odropped; /* lost packets due to nobufs, etc. */ u_quad_t ip6s_reassembled; /* total packets reassembled ok */ u_quad_t ip6s_fragmented; /* datagrams sucessfully fragmented */ u_quad_t ip6s_ofragments; /* output fragments created */ u_quad_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ u_quad_t ip6s_badoptions; /* error in option processing */ u_quad_t ip6s_noroute; /* packets discarded due to no route */ u_quad_t ip6s_badvers; /* ip6 version != 6 */ u_quad_t ip6s_rawout; /* total raw ip packets generated */ u_quad_t ip6s_badscope; /* scope error */ u_quad_t ip6s_notmember; /* don't join this multicast group */ u_quad_t ip6s_nxthist[256]; /* next header history */ u_quad_t ip6s_m1; /* one mbuf */ u_quad_t ip6s_m2m[32]; /* two or more mbuf */ u_quad_t ip6s_mext1; /* one ext mbuf */ u_quad_t ip6s_mext2m; /* two or more ext mbuf */ u_quad_t ip6s_exthdrtoolong; /* ext hdr are not continuous */ u_quad_t ip6s_nogif; /* no match gif found */ u_quad_t ip6s_toomanyhdr; /* discarded due to too many headers */ /* * statistics for improvement of the source address selection * algorithm: * XXX: hardcoded 16 = # of ip6 multicast scope types + 1 */ /* number of times that address selection fails */ u_quad_t ip6s_sources_none; /* number of times that an address on the outgoing I/F is chosen */ u_quad_t ip6s_sources_sameif[16]; /* number of times that an address on a non-outgoing I/F is chosen */ u_quad_t ip6s_sources_otherif[16]; /* * number of times that an address that has the same scope * from the destination is chosen. */ u_quad_t ip6s_sources_samescope[16]; /* * number of times that an address that has a different scope * from the destination is chosen. */ u_quad_t ip6s_sources_otherscope[16]; /* number of times that an deprecated address is chosen */ u_quad_t ip6s_sources_deprecated[16]; u_quad_t ip6s_forward_cachehit; u_quad_t ip6s_forward_cachemiss; }; #ifdef _KERNEL /* * IPv6 onion peeling state. * it will be initialized when we come into ip6_input(). * XXX do not make it a kitchen sink! */ struct ip6aux { u_int32_t ip6a_flags; #define IP6A_SWAP 0x01 /* swapped home/care-of on packet */ #define IP6A_HASEEN 0x02 /* HA was present */ #define IP6A_BRUID 0x04 /* BR Unique Identifier was present */ #define IP6A_RTALERTSEEN 0x08 /* rtalert present */ /* ip6.ip6_src */ struct in6_addr ip6a_careof; /* care-of address of the peer */ struct in6_addr ip6a_home; /* home address of the peer */ u_int16_t ip6a_bruid; /* BR unique identifier */ /* ip6.ip6_dst */ struct in6_ifaddr *ip6a_dstia6; /* my ifaddr that matches ip6_dst */ /* rtalert */ u_int16_t ip6a_rtalert; /* rtalert option value */ /* * decapsulation history will be here. 
* with IPsec it may not be accurate. */ }; #endif #ifdef _KERNEL /* flags passed to ip6_output as last parameter */ #define IPV6_DADOUTPUT 0x01 /* DAD */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ extern struct ip6stat ip6stat; /* statistics */ extern u_int32_t ip6_id; /* fragment identifier */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ extern int ip6_forwarding; /* act as router? */ extern int ip6_forward_srcrt; /* forward src-routed? */ extern int ip6_gif_hlim; /* Hop limit for gif encap packet */ extern int ip6_use_deprecated; /* allow deprecated addr as source */ extern int ip6_rr_prune; /* router renumbering prefix * walk list every 5 sec. */ extern int ip6_v6only; extern struct socket *ip6_mrouter; /* multicast routing daemon */ extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_sourcecheck; /* Verify source interface */ extern int ip6_sourcecheck_interval; /* Interval between log messages */ extern int ip6_accept_rtadv; /* Acts as a host not a router */ extern int ip6_keepfaith; /* Firewall Aided Internet Translator */ extern int ip6_log_interval; extern time_t ip6_log_time; extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ extern u_int32_t ip6_flow_seq; extern int ip6_auto_flowlabel; extern int ip6_auto_linklocal; extern int ip6_anonportmin; /* minimum ephemeral port */ extern int ip6_anonportmax; /* maximum ephemeral port */ extern int ip6_lowportmin; /* minimum reserved port */ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_use_tempaddr; /* whether to use temporary addresses. 
*/ extern struct pr_usrreqs rip6_usrreqs; struct sockopt; struct inpcb; int icmp6_ctloutput __P((struct socket *, struct sockopt *sopt)); struct in6_ifaddr; void ip6_init __P((void)); void ip6intr __P((void)); void ip6_input __P((struct mbuf *)); struct in6_ifaddr *ip6_getdstifaddr __P((struct mbuf *)); void ip6_freepcbopts __P((struct ip6_pktopts *)); void ip6_freemoptions __P((struct ip6_moptions *)); int ip6_unknown_opt __P((u_int8_t *, struct mbuf *, int)); char * ip6_get_prevhdr __P((struct mbuf *, int)); int ip6_nexthdr __P((struct mbuf *, int, int, int *)); int ip6_lasthdr __P((struct mbuf *, int, int, int *)); -struct mbuf *ip6_addaux __P((struct mbuf *)); -struct mbuf *ip6_findaux __P((struct mbuf *)); +struct ip6aux *ip6_addaux __P((struct mbuf *)); +struct ip6aux *ip6_findaux __P((struct mbuf *)); void ip6_delaux __P((struct mbuf *)); int ip6_mforward __P((struct ip6_hdr *, struct ifnet *, struct mbuf *)); int ip6_process_hopopts __P((struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *)); void ip6_savecontrol __P((struct inpcb *, struct mbuf **, struct ip6_hdr *, struct mbuf *)); void ip6_notify_pmtu __P((struct inpcb *, struct sockaddr_in6 *, u_int32_t *)); int ip6_sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); void ip6_forward __P((struct mbuf *, int)); void ip6_mloopback __P((struct ifnet *, struct mbuf *, struct sockaddr_in6 *)); int ip6_output __P((struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int, - struct ip6_moptions *, struct ifnet **)); + struct ip6_moptions *, struct ifnet **, + struct inpcb *)); int ip6_ctloutput __P((struct socket *, struct sockopt *sopt)); void init_ip6pktopts __P((struct ip6_pktopts *)); int ip6_setpktoptions __P((struct mbuf *, struct ip6_pktopts *, int, int)); void ip6_clearpktopts __P((struct ip6_pktopts *, int, int)); struct ip6_pktopts *ip6_copypktopts __P((struct ip6_pktopts *, int)); int ip6_optlen __P((struct inpcb *)); int route6_input __P((struct mbuf **, int *, int)); void frag6_init __P((void)); int frag6_input __P((struct mbuf **, int *, int)); void frag6_slowtimo __P((void)); void frag6_drain __P((void)); void rip6_init __P((void)); int rip6_input __P((struct mbuf **mp, int *offp, int proto)); void rip6_ctlinput __P((int, struct sockaddr *, void *)); int rip6_ctloutput __P((struct socket *so, struct sockopt *sopt)); int rip6_output __P((struct mbuf *, ...)); int rip6_usrreq __P((struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *)); int dest6_input __P((struct mbuf **, int *, int)); int none_input __P((struct mbuf **, int *, int)); #endif /* _KERNEL */ #endif /* !_NETINET6_IP6_VAR_H_ */ Index: head/sys/netinet6/ipsec.c =================================================================== --- head/sys/netinet6/ipsec.c (revision 105193) +++ head/sys/netinet6/ipsec.c (revision 105194) @@ -1,3602 +1,3502 @@ /* $FreeBSD$ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #ifdef IPSEC_ESP #include #ifdef INET6 #include #endif #endif #include #ifdef INET6 #include #endif #include #include #include #include #include #ifdef IPSEC_DEBUG int ipsec_debug = 1; #else int ipsec_debug = 0; #endif struct ipsecstat ipsecstat; int ip4_ah_cleartos = 1; int ip4_ah_offsetmask = 0; /* maybe IP_DF? */ int ip4_ipsec_dfbit = 0; /* DF bit on encap. 
0: clear 1: set 2: copy */ int ip4_esp_trans_deflev = IPSEC_LEVEL_USE; int ip4_esp_net_deflev = IPSEC_LEVEL_USE; int ip4_ah_trans_deflev = IPSEC_LEVEL_USE; int ip4_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip4_def_policy; int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ int ip4_esp_randpad = -1; #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_ipsec); #ifdef INET6 SYSCTL_DECL(_net_inet6_ipsec6); #endif #endif /* net.inet.ipsec */ SYSCTL_STRUCT(_net_inet_ipsec, IPSECCTL_STATS, stats, CTLFLAG_RD, &ipsecstat, ipsecstat, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, &ip4_def_policy.policy, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, &ip4_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, &ip4_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, &ip4_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, &ip4_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, CTLFLAG_RW, &ip4_ah_cleartos, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, CTLFLAG_RW, &ip4_ah_offsetmask, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit, CTLFLAG_RW, &ip4_ipsec_dfbit, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn, CTLFLAG_RW, &ip4_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug, CTLFLAG_RW, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ESP_RANDPAD, esp_randpad, CTLFLAG_RW, &ip4_esp_randpad, 0, ""); #ifdef INET6 struct ipsecstat ipsec6stat; int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; int ip6_esp_net_deflev = IPSEC_LEVEL_USE; int ip6_ah_trans_deflev = IPSEC_LEVEL_USE; int ip6_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip6_def_policy; int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ int ip6_esp_randpad = -1; /* net.inet6.ipsec6 */ SYSCTL_STRUCT(_net_inet6_ipsec6, IPSECCTL_STATS, stats, CTLFLAG_RD, &ipsec6stat, ipsecstat, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, &ip6_def_policy.policy, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, &ip6_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, &ip6_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, &ip6_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, &ip6_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, ecn, CTLFLAG_RW, &ip6_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_RW, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, esp_randpad, CTLFLAG_RW, &ip6_esp_randpad, 0, ""); #endif /* INET6 */ static int ipsec_setspidx_mbuf __P((struct secpolicyindex *, u_int, u_int, struct mbuf *, int)); static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); #ifdef INET6 static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); #endif static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #ifdef INET6 static void ipsec6_get_ulp __P((struct mbuf *m, struct 
secpolicyindex *, int)); static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #endif static struct inpcbpolicy *ipsec_newpcbpolicy __P((void)); static void ipsec_delpcbpolicy __P((struct inpcbpolicy *)); static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src)); static int ipsec_set_policy __P((struct secpolicy **pcb_sp, int optname, caddr_t request, size_t len, int priv)); static int ipsec_get_policy __P((struct secpolicy *pcb_sp, struct mbuf **mp)); static void vshiftl __P((unsigned char *, int, int)); static int ipsec_in_reject __P((struct secpolicy *, struct mbuf *)); static size_t ipsec_hdrsiz __P((struct secpolicy *)); #ifdef INET static struct mbuf *ipsec4_splithdr __P((struct mbuf *)); #endif #ifdef INET6 static struct mbuf *ipsec6_splithdr __P((struct mbuf *)); #endif #ifdef INET static int ipsec4_encapsulate __P((struct mbuf *, struct secasvar *)); #endif #ifdef INET6 static int ipsec6_encapsulate __P((struct mbuf *, struct secasvar *)); #endif -static struct mbuf *ipsec_addaux __P((struct mbuf *)); -static struct mbuf *ipsec_findaux __P((struct mbuf *)); -static void ipsec_optaux __P((struct mbuf *, struct mbuf *)); /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. */ struct secpolicy * ipsec4_getpolicybysock(m, dir, so, error) struct mbuf *m; u_int dir; struct socket *so; int *error; { struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *kernsp = NULL; /* policy on kernel */ /* sanity check */ if (m == NULL || so == NULL || error == NULL) panic("ipsec4_getpolicybysock: NULL pointer was passed.\n"); switch (so->so_proto->pr_domain->dom_family) { case AF_INET: /* set spidx in pcb */ *error = ipsec4_setspidx_inpcb(m, sotoinpcb(so)); break; #ifdef INET6 case AF_INET6: /* set spidx in pcb */ *error = ipsec6_setspidx_in6pcb(m, sotoin6pcb(so)); break; #endif default: panic("ipsec4_getpolicybysock: unsupported address family\n"); } if (*error) return NULL; switch (so->so_proto->pr_domain->dom_family) { case AF_INET: pcbsp = sotoinpcb(so)->inp_sp; break; #ifdef INET6 case AF_INET6: pcbsp = sotoin6pcb(so)->in6p_sp; break; #endif } /* sanity check */ if (pcbsp == NULL) panic("ipsec4_getpolicybysock: pcbsp is NULL.\n"); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; default: panic("ipsec4_getpolicybysock: illegal direction.\n"); } /* sanity check */ if (currsp == NULL) panic("ipsec4_getpolicybysock: currsp is NULL.\n"); /* when privilieged socket */ if (pcbsp->priv) { switch (currsp->policy) { case IPSEC_POLICY_BYPASS: currsp->refcnt++; *error = 0; return currsp; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ if (ip4_def_policy.policy != IPSEC_POLICY_DISCARD && ip4_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip4_def_policy.policy, IPSEC_POLICY_NONE)); ip4_def_policy.policy = IPSEC_POLICY_NONE; } 
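	/*
	 * (Note added for clarity: only DISCARD and NONE are meaningful
	 * as the system-wide default policy; the block above repairs any
	 * other value to NONE before the default is handed out below
	 * with an extra reference.)
	 */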
ip4_def_policy.refcnt++; *error = 0; return &ip4_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* when non-privilieged socket */ /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Illegal policy for non-priviliged defined %d\n", currsp->policy)); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: if (ip4_def_policy.policy != IPSEC_POLICY_DISCARD && ip4_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip4_def_policy.policy, IPSEC_POLICY_NONE)); ip4_def_policy.policy = IPSEC_POLICY_NONE; } ip4_def_policy.refcnt++; *error = 0; return &ip4_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec4_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. */ struct secpolicy * ipsec4_getpolicybyaddr(m, dir, flag, error) struct mbuf *m; u_int dir; int flag; int *error; { struct secpolicy *sp = NULL; /* sanity check */ if (m == NULL || error == NULL) panic("ipsec4_getpolicybyaddr: NULL pointer was passed.\n"); { struct secpolicyindex spidx; bzero(&spidx, sizeof(spidx)); /* make a index to look for a policy */ *error = ipsec_setspidx_mbuf(&spidx, dir, AF_INET, m, (flag & IP_FORWARDING) ? 0 : 1); if (*error != 0) return NULL; sp = key_allocsp(&spidx, dir); } /* SP found */ if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_getpolicybyaddr called " "to allocate SP:%p\n", sp)); *error = 0; return sp; } /* no SP found */ if (ip4_def_policy.policy != IPSEC_POLICY_DISCARD && ip4_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy:%d->%d\n", ip4_def_policy.policy, IPSEC_POLICY_NONE)); ip4_def_policy.policy = IPSEC_POLICY_NONE; } ip4_def_policy.refcnt++; *error = 0; return &ip4_def_policy; } #ifdef INET6 /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. 
* others: a pointer to SP */ struct secpolicy * ipsec6_getpolicybysock(m, dir, so, error) struct mbuf *m; u_int dir; struct socket *so; int *error; { struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *kernsp = NULL; /* policy on kernel */ /* sanity check */ if (m == NULL || so == NULL || error == NULL) panic("ipsec6_getpolicybysock: NULL pointer was passed.\n"); #ifdef DIAGNOSTIC if (so->so_proto->pr_domain->dom_family != AF_INET6) panic("ipsec6_getpolicybysock: socket domain != inet6\n"); #endif /* set spidx in pcb */ ipsec6_setspidx_in6pcb(m, sotoin6pcb(so)); pcbsp = sotoin6pcb(so)->in6p_sp; /* sanity check */ if (pcbsp == NULL) panic("ipsec6_getpolicybysock: pcbsp is NULL.\n"); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; default: panic("ipsec6_getpolicybysock: illegal direction.\n"); } /* sanity check */ if (currsp == NULL) panic("ipsec6_getpolicybysock: currsp is NULL.\n"); /* when privilieged socket */ if (pcbsp->priv) { switch (currsp->policy) { case IPSEC_POLICY_BYPASS: currsp->refcnt++; *error = 0; return currsp; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ if (ip6_def_policy.policy != IPSEC_POLICY_DISCARD && ip6_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip6_def_policy.policy, IPSEC_POLICY_NONE)); ip6_def_policy.policy = IPSEC_POLICY_NONE; } ip6_def_policy.refcnt++; *error = 0; return &ip6_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec6_getpolicybysock: " "Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* when non-privilieged socket */ /* look for a policy in SPD */ kernsp = key_allocsp(&currsp->spidx, dir); /* SP found */ if (kernsp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_getpolicybysock called " "to allocate SP:%p\n", kernsp)); *error = 0; return kernsp; } /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "ipsec6_getpolicybysock: " "Illegal policy for non-priviliged defined %d\n", currsp->policy)); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: if (ip6_def_policy.policy != IPSEC_POLICY_DISCARD && ip6_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip6_def_policy.policy, IPSEC_POLICY_NONE)); ip6_def_policy.policy = IPSEC_POLICY_NONE; } ip6_def_policy.refcnt++; *error = 0; return &ip6_def_policy; case IPSEC_POLICY_IPSEC: currsp->refcnt++; *error = 0; return currsp; default: ipseclog((LOG_ERR, "ipsec6_policybysock: Invalid policy for PCB %d\n", currsp->policy)); *error = EINVAL; return NULL; } /* NOTREACHED */ } /* * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * `flag' means that packet is to be forwarded whether or not. * flag = 1: forwad * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. 
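 *
 * (Note added for clarity: the (flag & IP_FORWARDING) test in the
 * body below decides whether port numbers are collected into the
 * policy index -- forwarded packets are matched on addresses and
 * protocol only, while locally generated packets are matched on
 * ports as well.)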
*/ #ifndef IP_FORWARDING #define IP_FORWARDING 1 #endif struct secpolicy * ipsec6_getpolicybyaddr(m, dir, flag, error) struct mbuf *m; u_int dir; int flag; int *error; { struct secpolicy *sp = NULL; /* sanity check */ if (m == NULL || error == NULL) panic("ipsec6_getpolicybyaddr: NULL pointer was passed.\n"); { struct secpolicyindex spidx; bzero(&spidx, sizeof(spidx)); /* make a index to look for a policy */ *error = ipsec_setspidx_mbuf(&spidx, dir, AF_INET6, m, (flag & IP_FORWARDING) ? 0 : 1); if (*error != 0) return NULL; sp = key_allocsp(&spidx, dir); } /* SP found */ if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_getpolicybyaddr called " "to allocate SP:%p\n", sp)); *error = 0; return sp; } /* no SP found */ if (ip6_def_policy.policy != IPSEC_POLICY_DISCARD && ip6_def_policy.policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", ip6_def_policy.policy, IPSEC_POLICY_NONE)); ip6_def_policy.policy = IPSEC_POLICY_NONE; } ip6_def_policy.refcnt++; *error = 0; return &ip6_def_policy; } #endif /* INET6 */ /* * set IP address into spidx from mbuf. * When Forwarding packet and ICMP echo reply, this function is used. * * IN: get the followings from mbuf. * protocol family, src, dst, next protocol * OUT: * 0: success. * other: failure, and set errno. */ int ipsec_setspidx_mbuf(spidx, dir, family, m, needport) struct secpolicyindex *spidx; u_int dir, family; struct mbuf *m; int needport; { int error; /* sanity check */ if (spidx == NULL || m == NULL) panic("ipsec_setspidx_mbuf: NULL pointer was passed.\n"); bzero(spidx, sizeof(*spidx)); error = ipsec_setspidx(m, spidx, needport); if (error) goto bad; spidx->dir = dir; return 0; bad: /* XXX initialize */ bzero(spidx, sizeof(*spidx)); return EINVAL; } static int ipsec4_setspidx_inpcb(m, pcb) struct mbuf *m; struct inpcb *pcb; { struct secpolicyindex *spidx; int error; /* sanity check */ if (pcb == NULL) panic("ipsec4_setspidx_inpcb: no PCB found.\n"); if (pcb->inp_sp == NULL) panic("ipsec4_setspidx_inpcb: no inp_sp found.\n"); if (pcb->inp_sp->sp_out == NULL || pcb->inp_sp->sp_in == NULL) panic("ipsec4_setspidx_inpcb: no sp_in/out found.\n"); bzero(&pcb->inp_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->inp_sp->sp_out->spidx, sizeof(*spidx)); spidx = &pcb->inp_sp->sp_in->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_INBOUND; spidx = &pcb->inp_sp->sp_out->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_OUTBOUND; return 0; bad: bzero(&pcb->inp_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->inp_sp->sp_out->spidx, sizeof(*spidx)); return error; } #ifdef INET6 static int ipsec6_setspidx_in6pcb(m, pcb) struct mbuf *m; struct in6pcb *pcb; { struct secpolicyindex *spidx; int error; /* sanity check */ if (pcb == NULL) panic("ipsec6_setspidx_in6pcb: no PCB found.\n"); if (pcb->in6p_sp == NULL) panic("ipsec6_setspidx_in6pcb: no in6p_sp found.\n"); if (pcb->in6p_sp->sp_out == NULL || pcb->in6p_sp->sp_in == NULL) panic("ipsec6_setspidx_in6pcb: no sp_in/out found.\n"); bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx)); spidx = &pcb->in6p_sp->sp_in->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_INBOUND; spidx = &pcb->in6p_sp->sp_out->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_OUTBOUND; return 0; bad: bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, 
sizeof(*spidx)); return error; } #endif /* * configure security policy index (src/dst/proto/sport/dport) * by looking at the content of mbuf. * the caller is responsible for error recovery (like clearing up spidx). */ static int ipsec_setspidx(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { struct ip *ip = NULL; struct ip ipbuf; u_int v; struct mbuf *n; int len; int error; if (m == NULL) panic("ipsec_setspidx: m == 0 passed.\n"); /* * validate m->m_pkthdr.len. we see incorrect length if we * mistakenly call this function with inconsistent mbuf chain * (like 4.4BSD tcp/udp processing). XXX should we panic here? */ len = 0; for (n = m; n; n = n->m_next) len += n->m_len; if (m->m_pkthdr.len != len) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "total of m_len(%d) != pkthdr.len(%d), " "ignored.\n", len, m->m_pkthdr.len)); return EINVAL; } if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "pkthdr.len(%d) < sizeof(struct ip), ignored.\n", m->m_pkthdr.len)); return EINVAL; } if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } #ifdef _IP_VHL v = _IP_VHL_V(ip->ip_vhl); #else v = ip->ip_v; #endif switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return error; ipsec4_get_ulp(m, spidx, needport); return 0; #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "pkthdr.len(%d) < sizeof(struct ip6_hdr), " "ignored.\n", m->m_pkthdr.len)); return EINVAL; } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return error; ipsec6_get_ulp(m, spidx, needport); return 0; #endif default: KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_setspidx: " "unknown IP version %u, ignored.\n", v)); return EINVAL; } } static void ipsec4_get_ulp(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { struct ip ip; struct ip6_ext ip6e; u_int8_t nxt; int off; struct tcphdr th; struct udphdr uh; /* sanity check */ if (m == NULL) panic("ipsec4_get_ulp: NULL pointer was passed.\n"); if (m->m_pkthdr.len < sizeof(ip)) panic("ipsec4_get_ulp: too short\n"); /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in *)&spidx->src)->sin_port = IPSEC_PORT_ANY; ((struct sockaddr_in *)&spidx->dst)->sin_port = IPSEC_PORT_ANY; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); /* ip_input() flips it into host endian XXX need more checking */ if (ip.ip_off & (IP_MF | IP_OFFMASK)) return; nxt = ip.ip_p; #ifdef _IP_VHL off = _IP_VHL_HL(ip->ip_vhl) << 2; #else off = ip.ip_hl << 2; #endif while (off < m->m_pkthdr.len) { switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) return; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) return; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in *)&spidx->src)->sin_port = th.th_sport; ((struct sockaddr_in *)&spidx->dst)->sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) return; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) return; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in *)&spidx->src)->sin_port = uh.uh_sport; ((struct sockaddr_in *)&spidx->dst)->sin_port = uh.uh_dport; return; case IPPROTO_AH: if (m->m_pkthdr.len > off + sizeof(ip6e)) return; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: default: /* XXX intermediate 
headers??? */ spidx->ul_proto = nxt; return; } } } /* assumes that m is sane */ static int ipsec4_setspidx_ipaddr(m, spidx) struct mbuf *m; struct secpolicyindex *spidx; { struct ip *ip = NULL; struct ip ipbuf; struct sockaddr_in *sin; if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } sin = (struct sockaddr_in *)&spidx->src; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_src, &sin->sin_addr, sizeof(ip->ip_src)); spidx->prefs = sizeof(struct in_addr) << 3; sin = (struct sockaddr_in *)&spidx->dst; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &sin->sin_addr, sizeof(ip->ip_dst)); spidx->prefd = sizeof(struct in_addr) << 3; return 0; } #ifdef INET6 static void ipsec6_get_ulp(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { int off, nxt; struct tcphdr th; struct udphdr uh; /* sanity check */ if (m == NULL) panic("ipsec6_get_ulp: NULL pointer was passed.\n"); KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec6_get_ulp:\n"); kdebug_mbuf(m)); /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: default: /* XXX intermediate headers??? 
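 *
 * (Note on the length arithmetic used by these upper-layer walks,
 * added for clarity: AH stores its payload length in 32-bit words
 * minus two, so ipsec4_get_ulp() above advances past it by
 * (ip6e_len + 2) << 2 bytes, while ordinary IPv6 extension headers
 * store their length in 8-byte units minus one, hence the
 * (len + 1) << 3 computations in the option handling earlier in
 * this change.)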
*/ spidx->ul_proto = nxt; break; } } /* assumes that m is sane */ static int ipsec6_setspidx_ipaddr(m, spidx) struct mbuf *m; struct secpolicyindex *spidx; { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) ip6 = mtod(m, struct ip6_hdr *); else { m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return 0; } #endif static struct inpcbpolicy * ipsec_newpcbpolicy() { struct inpcbpolicy *p; p = (struct inpcbpolicy *)malloc(sizeof(*p), M_SECA, M_NOWAIT); return p; } static void ipsec_delpcbpolicy(p) struct inpcbpolicy *p; { free(p, M_SECA); } /* initialize policy in PCB */ int ipsec_init_policy(so, pcb_sp) struct socket *so; struct inpcbpolicy **pcb_sp; { struct inpcbpolicy *new; /* sanity check. */ if (so == NULL || pcb_sp == NULL) panic("ipsec_init_policy: NULL pointer was passed.\n"); new = ipsec_newpcbpolicy(); if (new == NULL) { ipseclog((LOG_DEBUG, "ipsec_init_policy: No more memory.\n")); return ENOBUFS; } bzero(new, sizeof(*new)); if (so->so_cred != 0 && so->so_cred->cr_uid == 0) new->priv = 1; else new->priv = 0; if ((new->sp_in = key_newsp()) == NULL) { ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_in->state = IPSEC_SPSTATE_ALIVE; new->sp_in->policy = IPSEC_POLICY_ENTRUST; if ((new->sp_out = key_newsp()) == NULL) { key_freesp(new->sp_in); ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_out->state = IPSEC_SPSTATE_ALIVE; new->sp_out->policy = IPSEC_POLICY_ENTRUST; *pcb_sp = new; return 0; } /* copy old ipsec policy into new */ int ipsec_copy_policy(old, new) struct inpcbpolicy *old, *new; { struct secpolicy *sp; sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { key_freesp(new->sp_in); new->sp_in = sp; } else return ENOBUFS; sp = ipsec_deepcopy_policy(old->sp_out); if (sp) { key_freesp(new->sp_out); new->sp_out = sp; } else return ENOBUFS; new->priv = old->priv; return 0; } /* deep-copy a policy in PCB */ static struct secpolicy * ipsec_deepcopy_policy(src) struct secpolicy *src; { struct ipsecrequest *newchain = NULL; struct ipsecrequest *p; struct ipsecrequest **q; struct ipsecrequest *r; struct secpolicy *dst; dst = key_newsp(); if (src == NULL || dst == NULL) return NULL; /* * deep-copy IPsec request chain. This is required since struct * ipsecrequest is not reference counted. 
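 *
 * (Consequence, noted for clarity: each PCB policy owns its own
 * request chain, so freeing one socket's policy can never pull
 * requests out from under another's.)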
*/ q = &newchain; for (p = src->req; p; p = p->next) { *q = (struct ipsecrequest *)malloc(sizeof(struct ipsecrequest), M_SECA, M_NOWAIT); if (*q == NULL) goto fail; bzero(*q, sizeof(**q)); (*q)->next = NULL; (*q)->saidx.proto = p->saidx.proto; (*q)->saidx.mode = p->saidx.mode; (*q)->level = p->level; (*q)->saidx.reqid = p->saidx.reqid; bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src)); bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst)); (*q)->sav = NULL; (*q)->sp = dst; q = &((*q)->next); } dst->req = newchain; dst->state = src->state; dst->policy = src->policy; /* do not touch the refcnt fields */ return dst; fail: for (p = newchain; p; p = r) { r = p->next; free(p, M_SECA); p = NULL; } return NULL; } /* set policy and ipsec request if present. */ static int ipsec_set_policy(pcb_sp, optname, request, len, priv) struct secpolicy **pcb_sp; int optname; caddr_t request; size_t len; int priv; { struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; /* sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_set_policy: passed policy\n"); kdebug_sadb_x_policy((struct sadb_ext *)xpl)); /* check policy type */ /* ipsec_set_policy() accepts IPSEC, ENTRUST and BYPASS. */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return EINVAL; /* check privileged socket */ if (priv == 0 && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) return EACCES; /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return error; newsp->state = IPSEC_SPSTATE_ALIVE; /* clear old SP and set new SP */ key_freesp(*pcb_sp); *pcb_sp = newsp; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_set_policy: new policy\n"); kdebug_secpolicy(newsp)); return 0; } static int ipsec_get_policy(pcb_sp, mp) struct secpolicy *pcb_sp; struct mbuf **mp; { /* sanity check. */ if (pcb_sp == NULL || mp == NULL) return EINVAL; *mp = key_sp2msg(pcb_sp); if (!*mp) { ipseclog((LOG_DEBUG, "ipsec_get_policy: No more memory.\n")); return ENOBUFS; } (*mp)->m_type = MT_DATA; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_get_policy:\n"); kdebug_mbuf(*mp)); return 0; } int ipsec4_set_policy(inp, optname, request, len, priv) struct inpcb *inp; int optname; caddr_t request; size_t len; int priv; { struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (inp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec4_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, priv); } int ipsec4_get_policy(inp, request, len, mp) struct inpcb *inp; caddr_t request; size_t len; struct mbuf **mp; { struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. 
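 *
 * (Aside, added for illustration: these per-PCB get/set entry points
 * are what setsockopt(2)/getsockopt(2) with IP_IPSEC_POLICY reach.
 * A hypothetical userland sketch using libipsec, not part of this
 * change:
 *
 *	char *req = "out ipsec esp/transport//require";
 *	char *buf = ipsec_set_policy(req, strlen(req));
 *
 *	if (buf == NULL)
 *		errx(1, "%s", ipsec_strerror());
 *	if (setsockopt(s, IPPROTO_IP, IP_IPSEC_POLICY,
 *	    buf, ipsec_get_policylen(buf)) == -1)
 *		err(1, "IP_IPSEC_POLICY");
 *	free(buf);
 * )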
*/ if (inp == NULL || request == NULL || mp == NULL) return EINVAL; if (inp->inp_sp == NULL) panic("policy in PCB is NULL\n"); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec4_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } /* delete policy in PCB */ int ipsec4_delete_pcbpolicy(inp) struct inpcb *inp; { /* sanity check. */ if (inp == NULL) panic("ipsec4_delete_pcbpolicy: NULL pointer was passed.\n"); if (inp->inp_sp == NULL) return 0; if (inp->inp_sp->sp_in != NULL) { key_freesp(inp->inp_sp->sp_in); inp->inp_sp->sp_in = NULL; } if (inp->inp_sp->sp_out != NULL) { key_freesp(inp->inp_sp->sp_out); inp->inp_sp->sp_out = NULL; } ipsec_delpcbpolicy(inp->inp_sp); inp->inp_sp = NULL; return 0; } #ifdef INET6 int ipsec6_set_policy(in6p, optname, request, len, priv) struct in6pcb *in6p; int optname; caddr_t request; size_t len; int priv; { struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec6_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, priv); } int ipsec6_get_policy(in6p, request, len, mp) struct in6pcb *in6p; caddr_t request; size_t len; struct mbuf **mp; { struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL || mp == NULL) return EINVAL; if (in6p->in6p_sp == NULL) panic("policy in PCB is NULL\n"); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "ipsec6_set_policy: invalid direction=%u\n", xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } int ipsec6_delete_pcbpolicy(in6p) struct in6pcb *in6p; { /* sanity check. */ if (in6p == NULL) panic("ipsec6_delete_pcbpolicy: NULL pointer was passed.\n"); if (in6p->in6p_sp == NULL) return 0; if (in6p->in6p_sp->sp_in != NULL) { key_freesp(in6p->in6p_sp->sp_in); in6p->in6p_sp->sp_in = NULL; } if (in6p->in6p_sp->sp_out != NULL) { key_freesp(in6p->in6p_sp->sp_out); in6p->in6p_sp->sp_out = NULL; } ipsec_delpcbpolicy(in6p->in6p_sp); in6p->in6p_sp = NULL; return 0; } #endif /* * return current level. * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned. 
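 *
 * (Resolution, summarized for clarity: DEFAULT maps to the per-AF,
 * per-protocol sysctl default (sanitized by IPSEC_CHECK_DEFAULT),
 * except that IPCOMP always yields USE; USE and REQUIRE pass through
 * unchanged; UNIQUE is reported as REQUIRE.)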
*/ u_int ipsec_get_reqlevel(isr) struct ipsecrequest *isr; { u_int level = 0; u_int esp_trans_deflev, esp_net_deflev, ah_trans_deflev, ah_net_deflev; /* sanity check */ if (isr == NULL || isr->sp == NULL) panic("ipsec_get_reqlevel: NULL pointer is passed.\n"); if (((struct sockaddr *)&isr->sp->spidx.src)->sa_family != ((struct sockaddr *)&isr->sp->spidx.dst)->sa_family) panic("ipsec_get_reqlevel: family mismatched.\n"); /* XXX note that we have ipseclog() expanded here - code sync issue */ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) \ ? (ipsec_debug \ ? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ : 0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) /* set default level */ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_net_deflev); break; #endif /* INET6 */ default: panic("key_get_reqlevel: Unknown family. %d\n", ((struct sockaddr *)&isr->sp->spidx.src)->sa_family); } #undef IPSEC_CHECK_DEFAULT /* set level */ switch (isr->level) { case IPSEC_LEVEL_DEFAULT: switch (isr->saidx.proto) { case IPPROTO_ESP: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = esp_net_deflev; else level = esp_trans_deflev; break; case IPPROTO_AH: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = ah_net_deflev; else level = ah_trans_deflev; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets */ level = IPSEC_LEVEL_USE; break; default: panic("ipsec_get_reqlevel: " "Illegal protocol defined %u\n", isr->saidx.proto); } break; case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: level = isr->level; break; case IPSEC_LEVEL_UNIQUE: level = IPSEC_LEVEL_REQUIRE; break; default: panic("ipsec_get_reqlevel: Illegal IPsec level %u\n", isr->level); } return level; } /* * Check AH/ESP integrity. * OUT: * 0: valid * 1: invalid */ static int ipsec_in_reject(sp, m) struct secpolicy *sp; struct mbuf *m; { struct ipsecrequest *isr; u_int level; int need_auth, need_conf, need_icv; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec_in_reject: using SP\n"); kdebug_secpolicy(sp)); /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: return 1; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; case IPSEC_POLICY_IPSEC: break; case IPSEC_POLICY_ENTRUST: default: panic("ipsec_hdrsiz: Invalid policy found. 
%d\n", sp->policy); } need_auth = 0; need_conf = 0; need_icv = 0; /* XXX should compare policy against ipsec header history */ for (isr = sp->req; isr != NULL; isr = isr->next) { /* get current level */ level = ipsec_get_reqlevel(isr); switch (isr->saidx.proto) { case IPPROTO_ESP: if (level == IPSEC_LEVEL_REQUIRE) { need_conf++; if (isr->sav != NULL && isr->sav->flags == SADB_X_EXT_NONE && isr->sav->alg_auth != SADB_AALG_NONE) need_icv++; } break; case IPPROTO_AH: if (level == IPSEC_LEVEL_REQUIRE) { need_auth++; need_icv++; } break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets, IPComp policy * should always be treated as being in "use" level. */ break; } } KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("ipsec_in_reject: auth:%d conf:%d icv:%d m_flags:%x\n", need_auth, need_conf, need_icv, m->m_flags)); if ((need_conf && !(m->m_flags & M_DECRYPTED)) || (!need_auth && need_icv && !(m->m_flags & M_AUTHIPDGM)) || (need_auth && !(m->m_flags & M_AUTHIPHDR))) return 1; return 0; } /* * Check AH/ESP integrity. * This function is called from tcp_input(), udp_input(), * and {ah,esp}4_input for tunnel mode */ int ipsec4_in_reject_so(m, so) struct mbuf *m; struct socket *so; { struct secpolicy *sp = NULL; int error; int result; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec4_getpolicybyaddr() with IP_FORWARDING flag. */ if (so == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec4_getpolicybysock(m, IPSEC_DIR_INBOUND, so, &error); if (sp == NULL) return 0; /* XXX should be panic ? * -> No, there may be error. */ result = ipsec_in_reject(sp, m); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_in_reject_so call free SP:%p\n", sp)); key_freesp(sp); return result; } int ipsec4_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { if (inp == NULL) return ipsec4_in_reject_so(m, NULL); if (inp->inp_socket) return ipsec4_in_reject_so(m, inp->inp_socket); else panic("ipsec4_in_reject: invalid inpcb/socket"); } #ifdef INET6 /* * Check AH/ESP integrity. * This function is called from tcp6_input(), udp6_input(), * and {ah,esp}6_input for tunnel mode */ int ipsec6_in_reject_so(m, so) struct mbuf *m; struct socket *so; { struct secpolicy *sp = NULL; int error; int result; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec6_getpolicybyaddr() with IP_FORWARDING flag. */ if (so == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec6_getpolicybysock(m, IPSEC_DIR_INBOUND, so, &error); if (sp == NULL) return 0; /* XXX should be panic ? */ result = ipsec_in_reject(sp, m); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_in_reject_so call free SP:%p\n", sp)); key_freesp(sp); return result; } int ipsec6_in_reject(m, in6p) struct mbuf *m; struct in6pcb *in6p; { if (in6p == NULL) return ipsec6_in_reject_so(m, NULL); if (in6p->in6p_socket) return ipsec6_in_reject_so(m, in6p->in6p_socket); else panic("ipsec6_in_reject: invalid in6p/socket"); } #endif /* * compute the byte size to be occupied by IPsec header. * in case it is tunneled, it includes the size of outer IP header. * NOTE: SP passed is free in this function. 
*/ static size_t ipsec_hdrsiz(sp) struct secpolicy *sp; { struct ipsecrequest *isr; size_t siz, clen; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec_hdrsiz: using SP\n"); kdebug_secpolicy(sp)); /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; case IPSEC_POLICY_IPSEC: break; case IPSEC_POLICY_ENTRUST: default: panic("ipsec_hdrsiz: Invalid policy found. %d\n", sp->policy); } siz = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { clen = 0; switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP clen = esp_hdrsiz(isr); #else clen = 0; /* XXX */ #endif break; case IPPROTO_AH: clen = ah_hdrsiz(isr); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (((struct sockaddr *)&isr->saidx.dst)->sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: ipseclog((LOG_ERR, "ipsec_hdrsiz: " "unknown AF %d in IPsec tunnel SA\n", ((struct sockaddr *)&isr->saidx.dst)->sa_family)); break; } } siz += clen; } return siz; } /* This function is called from ip_forward() and ipsec4_hdrsize_tcp(). */ size_t ipsec4_hdrsiz(m, dir, inp) struct mbuf *m; u_int dir; struct inpcb *inp; { struct secpolicy *sp = NULL; int error; size_t size; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ if (inp != NULL && inp->inp_socket == NULL) panic("ipsec4_hdrsize: why is socket NULL but there is PCB."); /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec4_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec4_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec4_getpolicybysock(m, dir, inp->inp_socket, &error); if (sp == NULL) return 0; /* XXX should be panic ? */ size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec4_hdrsiz call free SP:%p\n", sp)); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec4_hdrsiz: size:%lu.\n", (unsigned long)size)); key_freesp(sp); return size; } #ifdef INET6 /* This function is called from ipsec6_hdrsize_tcp(), * and maybe from ip6_forward.() */ size_t ipsec6_hdrsiz(m, dir, in6p) struct mbuf *m; u_int dir; struct in6pcb *in6p; { struct secpolicy *sp = NULL; int error; size_t size; /* sanity check */ if (m == NULL) return 0; /* XXX shoud be panic ? */ if (in6p != NULL && in6p->in6p_socket == NULL) panic("ipsec6_hdrsize: why is socket NULL but there is PCB."); /* get SP for this packet */ /* XXX Is it right to call with IP_FORWARDING. */ if (in6p == NULL) sp = ipsec6_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec6_getpolicybysock(m, dir, in6p->in6p_socket, &error); if (sp == NULL) return 0; size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ipsec6_hdrsiz call free SP:%p\n", sp)); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_hdrsiz: size:%lu.\n", (unsigned long)size)); key_freesp(sp); return size; } #endif /* INET6 */ #ifdef INET /* * encapsulate for ipsec tunnel. * ip->ip_src must be fixed later on. */ static int ipsec4_encapsulate(m, sav) struct mbuf *m; struct secasvar *sav; { struct ip *oip; struct ip *ip; size_t hlen; size_t plen; /* can't tunnel between different AFs */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET) { m_freem(m); return EINVAL; } #if 0 /* XXX if the dst is myself, perform nothing. 
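A worked instance of the accumulation in ipsec_hdrsiz(). The per-SA numbers are assumptions for a DES-CBC ESP SA with HMAC-SHA1-96 and an HMAC-SHA1-96 AH SA, since the real esp_hdrsiz() and ah_hdrsiz() depend on the negotiated algorithms; only the outer-header sizes (20 bytes for IPv4, 40 for IPv6) are fixed by the address family switch above.

#include <stdio.h>

int
main(void)
{
	size_t siz = 0;

	/* request 1: "esp/tunnel/A-B/require" over IPv4 (assumed SA:
	 * SPI+seq 8, IV 8, pad trailer 2, truncated ICV 12) */
	siz += 8 + 8 + 2 + 12;	/* what esp_hdrsiz() might return */
	siz += 20;		/* sizeof(struct ip): tunnel adds outer header */

	/* request 2: "ah/transport//require" (fixed AH header + 96-bit ICV) */
	siz += 12 + 12;

	printf("reserve %zu bytes of IPsec header growth\n", siz);
	return 0;
}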
*/ if (key_ismyaddr((struct sockaddr *)&sav->sah->saidx.dst)) { m_freem(m); return EINVAL; } #endif if (m->m_len < sizeof(*ip)) panic("ipsec4_encapsulate: assumption failed (first mbuf length)"); ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif if (m->m_len != hlen) panic("ipsec4_encapsulate: assumption failed (first mbuf length)"); /* generate header checksum */ ip->ip_sum = 0; #ifdef _IP_VHL if (ip->ip_vhl == IP_VHL_BORING) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); #else ip->ip_sum = in_cksum(m, hlen); #endif plen = m->m_pkthdr.len; /* * grow the mbuf to accomodate the new IPv4 header. * NOTE: IPv4 options will never be copied. */ if (M_LEADINGSPACE(m->m_next) < hlen) { struct mbuf *n; MGET(n, M_DONTWAIT, MT_DATA); if (!n) { m_freem(m); return ENOBUFS; } n->m_len = hlen; n->m_next = m->m_next; m->m_next = n; m->m_pkthdr.len += hlen; oip = mtod(n, struct ip *); } else { m->m_next->m_len += hlen; m->m_next->m_data -= hlen; m->m_pkthdr.len += hlen; oip = mtod(m->m_next, struct ip *); } ip = mtod(m, struct ip *); ovbcopy((caddr_t)ip, (caddr_t)oip, hlen); m->m_len = sizeof(struct ip); m->m_pkthdr.len -= (hlen - sizeof(struct ip)); /* construct new IPv4 header. see RFC 2401 5.1.2.1 */ /* ECN consideration. */ ip_ecn_ingress(ip4_ipsec_ecn, &ip->ip_tos, &oip->ip_tos); #ifdef _IP_VHL ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); #else ip->ip_hl = sizeof(struct ip) >> 2; #endif ip->ip_off &= htons(~IP_OFFMASK); ip->ip_off &= htons(~IP_MF); switch (ip4_ipsec_dfbit) { case 0: /* clear DF bit */ ip->ip_off &= htons(~IP_DF); break; case 1: /* set DF bit */ ip->ip_off |= htons(IP_DF); break; default: /* copy DF bit */ break; } ip->ip_p = IPPROTO_IPIP; if (plen + sizeof(struct ip) < IP_MAXPACKET) ip->ip_len = htons(plen + sizeof(struct ip)); else { ipseclog((LOG_ERR, "IPv4 ipsec: size exceeds limit: " "leave ip_len as is (invalid packet)\n")); } #ifdef RANDOM_IP_ID ip->ip_id = ip_randomid(); #else ip->ip_id = htons(ip_id++); #endif bcopy(&((struct sockaddr_in *)&sav->sah->saidx.src)->sin_addr, &ip->ip_src, sizeof(ip->ip_src)); bcopy(&((struct sockaddr_in *)&sav->sah->saidx.dst)->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_ttl = IPDEFTTL; /* XXX Should ip_src be updated later ? */ return 0; } #endif /* INET */ #ifdef INET6 static int ipsec6_encapsulate(m, sav) struct mbuf *m; struct secasvar *sav; { struct ip6_hdr *oip6; struct ip6_hdr *ip6; size_t plen; /* can't tunnel between different AFs */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family || ((struct sockaddr *)&sav->sah->saidx.src)->sa_family != AF_INET6) { m_freem(m); return EINVAL; } #if 0 /* XXX if the dst is myself, perform nothing. */ if (key_ismyaddr((struct sockaddr *)&sav->sah->saidx.dst)) { m_freem(m); return EINVAL; } #endif plen = m->m_pkthdr.len; /* * grow the mbuf to accomodate the new IPv6 header. 
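Both encapsulate routines share the make-room step that precedes the header copy: if the payload mbuf has leading space, back its data pointer up; otherwise splice in a fresh mbuf. Isolated below as a sketch (kernel mbuf(9) context with this file's includes is assumed; the helper name is invented, and the header copy and field rewriting that follow above are deliberately left out).

/* sketch only: make hlen bytes available in front of the payload,
 * assuming the IP header has already been split into its own mbuf */
static struct mbuf *
make_room_for_old_header(struct mbuf *m, int hlen)
{
	struct mbuf *n;

	if (M_LEADINGSPACE(m->m_next) >= hlen) {
		/* cheap case: room before the payload's data */
		m->m_next->m_data -= hlen;
		m->m_next->m_len += hlen;
	} else {
		/* no room: link a fresh mbuf after the header mbuf */
		MGET(n, M_DONTWAIT, MT_DATA);
		if (n == NULL)
			return NULL;	/* caller frees the chain */
		n->m_len = hlen;
		n->m_next = m->m_next;
		m->m_next = n;
	}
	m->m_pkthdr.len += hlen;
	return m;
}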
*/ if (m->m_len != sizeof(struct ip6_hdr)) panic("ipsec6_encapsulate: assumption failed (first mbuf length)"); if (M_LEADINGSPACE(m->m_next) < sizeof(struct ip6_hdr)) { struct mbuf *n; MGET(n, M_DONTWAIT, MT_DATA); if (!n) { m_freem(m); return ENOBUFS; } n->m_len = sizeof(struct ip6_hdr); n->m_next = m->m_next; m->m_next = n; m->m_pkthdr.len += sizeof(struct ip6_hdr); oip6 = mtod(n, struct ip6_hdr *); } else { m->m_next->m_len += sizeof(struct ip6_hdr); m->m_next->m_data -= sizeof(struct ip6_hdr); m->m_pkthdr.len += sizeof(struct ip6_hdr); oip6 = mtod(m->m_next, struct ip6_hdr *); } ip6 = mtod(m, struct ip6_hdr *); ovbcopy((caddr_t)ip6, (caddr_t)oip6, sizeof(struct ip6_hdr)); /* Fake link-local scope-class addresses */ if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src)) oip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst)) oip6->ip6_dst.s6_addr16[1] = 0; /* construct new IPv6 header. see RFC 2401 5.1.2.2 */ /* ECN consideration. */ ip6_ecn_ingress(ip6_ipsec_ecn, &ip6->ip6_flow, &oip6->ip6_flow); if (plen < IPV6_MAXPACKET - sizeof(struct ip6_hdr)) ip6->ip6_plen = htons(plen); else { /* ip6->ip6_plen will be updated in ip6_output() */ } ip6->ip6_nxt = IPPROTO_IPV6; bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.src)->sin6_addr, &ip6->ip6_src, sizeof(ip6->ip6_src)); bcopy(&((struct sockaddr_in6 *)&sav->sah->saidx.dst)->sin6_addr, &ip6->ip6_dst, sizeof(ip6->ip6_dst)); ip6->ip6_hlim = IPV6_DEFHLIM; /* XXX Should ip6_src be updated later ? */ return 0; } #endif /* INET6 */ /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * based on RFC 2401. */ int ipsec_chkreplay(seq, sav) u_int32_t seq; struct secasvar *sav; { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ /* sanity check */ if (sav == NULL) panic("ipsec_chkreplay: NULL pointer was passed.\n"); replay = sav->replay; if (replay->wsize == 0) return 1; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 0; /* first time is always okay */ if (replay->count == 0) return 1; if (seq > replay->lastseq) { /* larger sequences are okay */ return 1; } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 0; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 0; /* out of order but good */ return 1; } } /* * check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(seq, sav) u_int32_t seq; struct secasvar *sav; { struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ /* sanity check */ if (sav == NULL) panic("ipsec_chkreplay: NULL pointer was passed.\n"); replay = sav->replay; if (replay->wsize == 0) goto ok; /* no need to check replay. 
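A standalone worked example of the window check in ipsec_chkreplay() above. A 4-byte bitmap covers the 32 sequence numbers ending at lastseq; anything older is rejected outright, anything newer is accepted (and would slide the window in ipsec_updatereplay()). The structure and setup below are a simplified model, not the kernel's secreplay.

#include <stdio.h>
#include <string.h>

#define WSIZE 4		/* bytes of bitmap; 32-bit window, as IPSEC_REPLAYWSIZE */

struct replay {
	unsigned int lastseq;
	unsigned char bitmap[WSIZE];
	unsigned int count;
};

static int
chkreplay(unsigned int seq, struct replay *rp)
{
	unsigned int wsizeb = WSIZE << 3, diff;
	int fr, frlast = WSIZE - 1;

	if (seq == 0)
		return 0;			/* sequence 0 is never valid */
	if (rp->count == 0 || seq > rp->lastseq)
		return 1;			/* first packet, or newer */
	diff = rp->lastseq - seq;
	if (diff >= wsizeb)
		return 0;			/* left of the window: too old */
	fr = frlast - diff / 8;
	return !(rp->bitmap[fr] & (1 << (diff % 8)));	/* 0 if already seen */
}

int
main(void)
{
	struct replay rp;

	memset(&rp, 0, sizeof(rp));
	rp.lastseq = 100;	/* pretend updatereplay() accepted seq 100 */
	rp.bitmap[WSIZE - 1] = 1;
	rp.count = 1;

	printf("seq 101: %d\n", chkreplay(101, &rp));	/* 1: newer than window */
	printf("seq 100: %d\n", chkreplay(100, &rp));	/* 0: replayed */
	printf("seq  99: %d\n", chkreplay(99, &rp));	/* 1: in window, unseen */
	printf("seq  68: %d\n", chkreplay(68, &rp));	/* 0: too old, diff >= 32 */
	return 0;
}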
*/ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 1; /* first time */ if (replay->count == 0) { replay->lastseq = seq; bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* new larger sequence number */ if (diff < wsizeb) { /* In window */ /* set bit for this packet */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* this packet has a "way larger" */ bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* larger is good */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 1; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 1; /* mark as seen */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* out of order but good */ } ok: if (replay->count == ~0) { /* set overflow flag */ replay->overflow++; /* don't increment, no more packets accepted */ if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) return 1; ipseclog((LOG_WARNING, "replay counter made %d cycle. %s\n", replay->overflow, ipsec_logsastr(sav))); } replay->count++; return 0; } /* * shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). */ static void vshiftl(bitmap, nbit, wsize) unsigned char *bitmap; int nbit, wsize; { int s, j, i; unsigned char over; for (j = 0; j < nbit; j += 8) { s = (nbit - j < 8) ? (nbit - j): 8; bitmap[0] <<= s; for (i = 1; i < wsize; i++) { over = (bitmap[i] >> (8 - s)); bitmap[i] <<= s; bitmap[i-1] |= over; } } return; } const char * ipsec4_logpacketstr(ip, spi) struct ip *ip; u_int32_t spi; { static char buf[256]; char *p; u_int8_t *s, *d; s = (u_int8_t *)(&ip->ip_src); d = (u_int8_t *)(&ip->ip_dst); p = buf; snprintf(buf, sizeof(buf), "packet(SPI=%u ", (u_int32_t)ntohl(spi)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), "src=%u.%u.%u.%u", s[0], s[1], s[2], s[3]); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), " dst=%u.%u.%u.%u", d[0], d[1], d[2], d[3]); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), ")"); return buf; } #ifdef INET6 const char * ipsec6_logpacketstr(ip6, spi) struct ip6_hdr *ip6; u_int32_t spi; { static char buf[256]; char *p; p = buf; snprintf(buf, sizeof(buf), "packet(SPI=%u ", (u_int32_t)ntohl(spi)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), "src=%s", ip6_sprintf(&ip6->ip6_src)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), " dst=%s", ip6_sprintf(&ip6->ip6_dst)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), ")"); return buf; } #endif /* INET6 */ const char * ipsec_logsastr(sav) struct secasvar *sav; { static char buf[256]; char *p; struct secasindex *saidx = &sav->sah->saidx; /* validity check */ if (((struct sockaddr *)&sav->sah->saidx.src)->sa_family != ((struct sockaddr *)&sav->sah->saidx.dst)->sa_family) panic("ipsec_logsastr: family mismatched.\n"); p = buf; snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi)); while (p && *p) p++; if (((struct sockaddr *)&saidx->src)->sa_family == AF_INET) { u_int8_t *s, *d; s = (u_int8_t *)&((struct sockaddr_in *)&saidx->src)->sin_addr; d = (u_int8_t *)&((struct sockaddr_in *)&saidx->dst)->sin_addr; snprintf(p, sizeof(buf) - (p - buf), 
"src=%d.%d.%d.%d dst=%d.%d.%d.%d", s[0], s[1], s[2], s[3], d[0], d[1], d[2], d[3]); } #ifdef INET6 else if (((struct sockaddr *)&saidx->src)->sa_family == AF_INET6) { snprintf(p, sizeof(buf) - (p - buf), "src=%s", ip6_sprintf(&((struct sockaddr_in6 *)&saidx->src)->sin6_addr)); while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), " dst=%s", ip6_sprintf(&((struct sockaddr_in6 *)&saidx->dst)->sin6_addr)); } #endif while (p && *p) p++; snprintf(p, sizeof(buf) - (p - buf), ")"); return buf; } void ipsec_dumpmbuf(m) struct mbuf *m; { int totlen; int i; u_char *p; totlen = 0; printf("---\n"); while (m) { p = mtod(m, u_char *); for (i = 0; i < m->m_len; i++) { printf("%02x ", p[i]); totlen++; if (totlen % 16 == 0) printf("\n"); } m = m->m_next; } if (totlen % 16 != 0) printf("\n"); printf("---\n"); } #ifdef INET /* * IPsec output logic for IPv4. */ int ipsec4_output(state, sp, flags) struct ipsec_output_state *state; struct secpolicy *sp; int flags; { struct ip *ip = NULL; struct ipsecrequest *isr = NULL; struct secasindex saidx; int s; int error; struct sockaddr_in *dst4; struct sockaddr_in *sin; if (!state) panic("state == NULL in ipsec4_output"); if (!state->m) panic("state->m == NULL in ipsec4_output"); if (!state->ro) panic("state->ro == NULL in ipsec4_output"); if (!state->dst) panic("state->dst == NULL in ipsec4_output"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec4_output: applyed SP\n"); kdebug_secpolicy(sp)); for (isr = sp->req; isr != NULL; isr = isr->next) { #if 0 /* give up to check restriction of transport mode */ /* XXX but should be checked somewhere */ /* * some of the IPsec operation must be performed only in * originating case. */ if (isr->saidx.mode == IPSEC_MODE_TRANSPORT && (flags & IP_FORWARDING)) continue; #endif /* make SA index for search proper SA */ ip = mtod(state->m, struct ip *); bcopy(&isr->saidx, &saidx, sizeof(saidx)); saidx.mode = isr->saidx.mode; saidx.reqid = isr->saidx.reqid; sin = (struct sockaddr_in *)&saidx.src; if (sin->sin_len == 0) { sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_port = IPSEC_PORT_ANY; bcopy(&ip->ip_src, &sin->sin_addr, sizeof(sin->sin_addr)); } sin = (struct sockaddr_in *)&saidx.dst; if (sin->sin_len == 0) { sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_port = IPSEC_PORT_ANY; bcopy(&ip->ip_dst, &sin->sin_addr, sizeof(sin->sin_addr)); } if ((error = key_checkrequest(isr, &saidx)) != 0) { /* * IPsec processing is required, but no SA found. * I assume that key_acquire() had been called * to get/establish the SA. Here I discard * this packet because it is responsibility for * upper layer to retransmit the packet. */ ipsecstat.out_nosa++; goto bad; } /* validity check */ if (isr->sav == NULL) { switch (ipsec_get_reqlevel(isr)) { case IPSEC_LEVEL_USE: continue; case IPSEC_LEVEL_REQUIRE: /* must be not reached here. */ panic("ipsec4_output: no SA found, but required."); } } /* * If there is no valid SA, we give up to process any * more. In such a case, the SA's status is changed * from DYING to DEAD after allocating. If a packet * send to the receiver by dead SA, the receiver can * not decode a packet because SA has been dead. */ if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) { ipsecstat.out_nosa++; error = EINVAL; goto bad; } /* * There may be the case that SA status will be changed when * we are refering to one. So calling splsoftnet(). */ s = splnet(); if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* * build IPsec tunnel. 
*/ /* XXX should be processed with other familiy */ if (((struct sockaddr *)&isr->sav->sah->saidx.src)->sa_family != AF_INET) { ipseclog((LOG_ERR, "ipsec4_output: " "family mismatched between inner and outer spi=%u\n", (u_int32_t)ntohl(isr->sav->spi))); splx(s); error = EAFNOSUPPORT; goto bad; } state->m = ipsec4_splithdr(state->m); if (!state->m) { splx(s); error = ENOMEM; goto bad; } error = ipsec4_encapsulate(state->m, isr->sav); splx(s); if (error) { state->m = NULL; goto bad; } ip = mtod(state->m, struct ip *); state->ro = &isr->sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst4 = (struct sockaddr_in *)state->dst; if (state->ro->ro_rt && ((state->ro->ro_rt->rt_flags & RTF_UP) == 0 || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(state->ro->ro_rt); state->ro->ro_rt = NULL; } if (state->ro->ro_rt == 0) { dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; rtalloc(state->ro); } if (state->ro->ro_rt == 0) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } /* adjust state->dst if tunnel endpoint is offlink */ if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) { state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst4 = (struct sockaddr_in *)state->dst; } } else splx(s); state->m = ipsec4_splithdr(state->m); if (!state->m) { error = ENOMEM; goto bad; } switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP if ((error = esp4_output(state->m, isr)) != 0) { state->m = NULL; goto bad; } break; #else m_freem(state->m); state->m = NULL; error = EINVAL; goto bad; #endif case IPPROTO_AH: if ((error = ah4_output(state->m, isr)) != 0) { state->m = NULL; goto bad; } break; case IPPROTO_IPCOMP: if ((error = ipcomp4_output(state->m, isr)) != 0) { state->m = NULL; goto bad; } break; default: ipseclog((LOG_ERR, "ipsec4_output: unknown ipsec protocol %d\n", isr->saidx.proto)); m_freem(state->m); state->m = NULL; error = EINVAL; goto bad; } if (state->m == 0) { error = ENOMEM; goto bad; } ip = mtod(state->m, struct ip *); } return 0; bad: m_freem(state->m); state->m = NULL; return error; } #endif #ifdef INET6 /* * IPsec output logic for IPv6, transport mode. 
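The tunnel branch above also shows the discipline for the per-SA cached route (sav->sah->sa_route): keep the cache only while the route is still up and still points at the current tunnel endpoint, otherwise free it and look up again. The same pattern isolated as a sketch; kernel route(9) context with this file's includes is assumed, and the helper name is invented. The RTF_GATEWAY adjustment above is a separate step: when the endpoint is off-link, state->dst must point at the gateway rather than the tunnel exit.

/* sketch only: validate-or-refresh a cached route toward dst */
static int
tunnel_route(struct route *ro, struct in_addr dst)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)&ro->ro_dst;

	/* drop a cached entry that went down or points elsewhere */
	if (ro->ro_rt != NULL &&
	    ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
	    sin->sin_addr.s_addr != dst.s_addr)) {
		RTFREE(ro->ro_rt);
		ro->ro_rt = NULL;
	}
	if (ro->ro_rt == NULL) {
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = dst;
		rtalloc(ro);		/* may still fail */
	}
	return (ro->ro_rt == NULL) ? EHOSTUNREACH : 0;
}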
*/ int ipsec6_output_trans(state, nexthdrp, mprev, sp, flags, tun) struct ipsec_output_state *state; u_char *nexthdrp; struct mbuf *mprev; struct secpolicy *sp; int flags; int *tun; { struct ip6_hdr *ip6; struct ipsecrequest *isr = NULL; struct secasindex saidx; int error = 0; int plen; struct sockaddr_in6 *sin6; if (!state) panic("state == NULL in ipsec6_output_trans"); if (!state->m) panic("state->m == NULL in ipsec6_output_trans"); if (!nexthdrp) panic("nexthdrp == NULL in ipsec6_output_trans"); if (!mprev) panic("mprev == NULL in ipsec6_output_trans"); if (!sp) panic("sp == NULL in ipsec6_output_trans"); if (!tun) panic("tun == NULL in ipsec6_output_trans"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_trans: applyed SP\n"); kdebug_secpolicy(sp)); *tun = 0; for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* the rest will be handled by ipsec6_output_tunnel() */ break; } /* make SA index for search proper SA */ ip6 = mtod(state->m, struct ip6_hdr *); bcopy(&isr->saidx, &saidx, sizeof(saidx)); saidx.mode = isr->saidx.mode; saidx.reqid = isr->saidx.reqid; sin6 = (struct sockaddr_in6 *)&saidx.src; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } } sin6 = (struct sockaddr_in6 *)&saidx.dst; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } } if (key_checkrequest(isr, &saidx) == ENOENT) { /* * IPsec processing is required, but no SA found. * I assume that key_acquire() had been called * to get/establish the SA. Here I discard * this packet because it is responsibility for * upper layer to retransmit the packet. */ ipsec6stat.out_nosa++; error = ENOENT; /* * Notify the fact that the packet is discarded * to ourselves. I believe this is better than * just silently discarding. (jinmei@kame.net) * XXX: should we restrict the error to TCP packets? * XXX: should we directly notify sockets via * pfctlinputs? */ icmp6_error(state->m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADMIN, 0); state->m = NULL; /* icmp6_error freed the mbuf */ goto bad; } /* validity check */ if (isr->sav == NULL) { switch (ipsec_get_reqlevel(isr)) { case IPSEC_LEVEL_USE: continue; case IPSEC_LEVEL_REQUIRE: /* must be not reached here. */ panic("ipsec6_output_trans: no SA found, but required."); } } /* * If there is no valid SA, we give up to process. * see same place at ipsec4_output(). 
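The s6_addr16[1] manipulation above is KAME's embedded scope-id convention: for link-local addresses the kernel stores the interface index in the second 16-bit word of the address itself, and SPD comparison wants that word cleared with the index moved to sin6_scope_id. A standalone model of the normalization; s6_addr16 is a kernel-only accessor in the BSD headers, so the demo open-codes it.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct sockaddr_in6 sin6;
	unsigned short *addr16;		/* stands in for s6_addr16 */
	char buf[INET6_ADDRSTRLEN];

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
	addr16 = (unsigned short *)&sin6.sin6_addr;
	addr16[1] = htons(3);	/* kernel-internal embedded form: if_index 3 */

	/* normalize before SPD comparison, as ipsec6_output_trans() does */
	sin6.sin6_scope_id = ntohs(addr16[1]);
	addr16[1] = 0;

	inet_ntop(AF_INET6, &sin6.sin6_addr, buf, sizeof(buf));
	printf("%s scope %u\n", buf, (unsigned)sin6.sin6_scope_id);
	return 0;		/* prints "fe80::1 scope 3" */
}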
*/ if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) { ipsec6stat.out_nosa++; error = EINVAL; goto bad; } switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP error = esp6_output(state->m, nexthdrp, mprev->m_next, isr); #else m_freem(state->m); error = EINVAL; #endif break; case IPPROTO_AH: error = ah6_output(state->m, nexthdrp, mprev->m_next, isr); break; case IPPROTO_IPCOMP: error = ipcomp6_output(state->m, nexthdrp, mprev->m_next, isr); break; default: ipseclog((LOG_ERR, "ipsec6_output_trans: " "unknown ipsec protocol %d\n", isr->saidx.proto)); m_freem(state->m); ipsec6stat.out_inval++; error = EINVAL; break; } if (error) { state->m = NULL; goto bad; } plen = state->m->m_pkthdr.len - sizeof(struct ip6_hdr); if (plen > IPV6_MAXPACKET) { ipseclog((LOG_ERR, "ipsec6_output_trans: " "IPsec with IPv6 jumbogram is not supported\n")); ipsec6stat.out_inval++; error = EINVAL; /* XXX */ goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); ip6->ip6_plen = htons(plen); } /* if we have more to go, we need a tunnel mode processing */ if (isr != NULL) *tun = 1; return 0; bad: m_freem(state->m); state->m = NULL; return error; } /* * IPsec output logic for IPv6, tunnel mode. */ int ipsec6_output_tunnel(state, sp, flags) struct ipsec_output_state *state; struct secpolicy *sp; int flags; { struct ip6_hdr *ip6; struct ipsecrequest *isr = NULL; struct secasindex saidx; int error = 0; int plen; struct sockaddr_in6* dst6; int s; if (!state) panic("state == NULL in ipsec6_output_tunnel"); if (!state->m) panic("state->m == NULL in ipsec6_output_tunnel"); if (!sp) panic("sp == NULL in ipsec6_output_tunnel"); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("ipsec6_output_tunnel: applyed SP\n"); kdebug_secpolicy(sp)); /* * transport mode ipsec (before the 1st tunnel mode) is already * processed by ipsec6_output_trans(). */ for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) break; } for (/* already initialized */; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* When tunnel mode, SA peers must be specified. */ bcopy(&isr->saidx, &saidx, sizeof(saidx)); } else { /* make SA index to look for a proper SA */ struct sockaddr_in6 *sin6; bzero(&saidx, sizeof(saidx)); saidx.proto = isr->saidx.proto; saidx.mode = isr->saidx.mode; saidx.reqid = isr->saidx.reqid; ip6 = mtod(state->m, struct ip6_hdr *); sin6 = (struct sockaddr_in6 *)&saidx.src; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } } sin6 = (struct sockaddr_in6 *)&saidx.dst; if (sin6->sin6_len == 0) { sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = IPSEC_PORT_ANY; bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { /* fix scope id for comparing SPD */ sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } } } if (key_checkrequest(isr, &saidx) == ENOENT) { /* * IPsec processing is required, but no SA found. * I assume that key_acquire() had been called * to get/establish the SA. Here I discard * this packet because it is responsibility for * upper layer to retransmit the packet. 
*/ ipsec6stat.out_nosa++; error = ENOENT; goto bad; } /* validity check */ if (isr->sav == NULL) { switch (ipsec_get_reqlevel(isr)) { case IPSEC_LEVEL_USE: continue; case IPSEC_LEVEL_REQUIRE: /* must be not reached here. */ panic("ipsec6_output_tunnel: no SA found, but required."); } } /* * If there is no valid SA, we give up to process. * see same place at ipsec4_output(). */ if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) { ipsec6stat.out_nosa++; error = EINVAL; goto bad; } /* * There may be the case that SA status will be changed when * we are refering to one. So calling splsoftnet(). */ s = splnet(); if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { /* * build IPsec tunnel. */ /* XXX should be processed with other familiy */ if (((struct sockaddr *)&isr->sav->sah->saidx.src)->sa_family != AF_INET6) { ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "family mismatched between inner and outer, spi=%u\n", (u_int32_t)ntohl(isr->sav->spi))); splx(s); ipsec6stat.out_inval++; error = EAFNOSUPPORT; goto bad; } state->m = ipsec6_splithdr(state->m); if (!state->m) { splx(s); ipsec6stat.out_nomem++; error = ENOMEM; goto bad; } error = ipsec6_encapsulate(state->m, isr->sav); splx(s); if (error) { state->m = 0; goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); state->ro = &isr->sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst6 = (struct sockaddr_in6 *)state->dst; if (state->ro->ro_rt && ((state->ro->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) { RTFREE(state->ro->ro_rt); state->ro->ro_rt = NULL; } if (state->ro->ro_rt == 0) { bzero(dst6, sizeof(*dst6)); dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; rtalloc(state->ro); } if (state->ro->ro_rt == 0) { ip6stat.ip6s_noroute++; ipsec6stat.out_noroute++; error = EHOSTUNREACH; goto bad; } /* adjust state->dst if tunnel endpoint is offlink */ if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) { state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst6 = (struct sockaddr_in6 *)state->dst; } } else splx(s); state->m = ipsec6_splithdr(state->m); if (!state->m) { ipsec6stat.out_nomem++; error = ENOMEM; goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); switch (isr->saidx.proto) { case IPPROTO_ESP: #ifdef IPSEC_ESP error = esp6_output(state->m, &ip6->ip6_nxt, state->m->m_next, isr); #else m_freem(state->m); error = EINVAL; #endif break; case IPPROTO_AH: error = ah6_output(state->m, &ip6->ip6_nxt, state->m->m_next, isr); break; case IPPROTO_IPCOMP: /* XXX code should be here */ /* FALLTHROUGH */ default: ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "unknown ipsec protocol %d\n", isr->saidx.proto)); m_freem(state->m); ipsec6stat.out_inval++; error = EINVAL; break; } if (error) { state->m = NULL; goto bad; } plen = state->m->m_pkthdr.len - sizeof(struct ip6_hdr); if (plen > IPV6_MAXPACKET) { ipseclog((LOG_ERR, "ipsec6_output_tunnel: " "IPsec with IPv6 jumbogram is not supported\n")); ipsec6stat.out_inval++; error = EINVAL; /* XXX */ goto bad; } ip6 = mtod(state->m, struct ip6_hdr *); ip6->ip6_plen = htons(plen); } return 0; bad: m_freem(state->m); state->m = NULL; return error; } #endif /* INET6 */ #ifdef INET /* * Chop IP header and option off from the payload. 
*/ static struct mbuf * ipsec4_splithdr(m) struct mbuf *m; { struct mbuf *mh; struct ip *ip; int hlen; if (m->m_len < sizeof(struct ip)) panic("ipsec4_splithdr: first mbuf too short"); ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif if (m->m_len > hlen) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (!mh) { m_freem(m); return NULL; } M_COPY_PKTHDR(mh, m); MH_ALIGN(mh, hlen); m->m_flags &= ~M_PKTHDR; m->m_len -= hlen; m->m_data += hlen; mh->m_next = m; m = mh; m->m_len = hlen; bcopy((caddr_t)ip, mtod(m, caddr_t), hlen); } else if (m->m_len < hlen) { m = m_pullup(m, hlen); if (!m) return NULL; } return m; } #endif #ifdef INET6 static struct mbuf * ipsec6_splithdr(m) struct mbuf *m; { struct mbuf *mh; struct ip6_hdr *ip6; int hlen; if (m->m_len < sizeof(struct ip6_hdr)) panic("ipsec6_splithdr: first mbuf too short"); ip6 = mtod(m, struct ip6_hdr *); hlen = sizeof(struct ip6_hdr); if (m->m_len > hlen) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (!mh) { m_freem(m); return NULL; } M_COPY_PKTHDR(mh, m); MH_ALIGN(mh, hlen); m->m_flags &= ~M_PKTHDR; m->m_len -= hlen; m->m_data += hlen; mh->m_next = m; m = mh; m->m_len = hlen; bcopy((caddr_t)ip6, mtod(m, caddr_t), hlen); } else if (m->m_len < hlen) { m = m_pullup(m, hlen); if (!m) return NULL; } return m; } #endif /* validate inbound IPsec tunnel packet. */ int ipsec4_tunnel_validate(m, off, nxt0, sav) struct mbuf *m; /* no pullup permitted, m->m_len >= ip */ int off; u_int nxt0; struct secasvar *sav; { u_int8_t nxt = nxt0 & 0xff; struct sockaddr_in *sin; struct sockaddr_in osrc, odst, isrc, idst; int hlen; struct secpolicy *sp; struct ip *oip; #ifdef DIAGNOSTIC if (m->m_len < sizeof(struct ip)) panic("too short mbuf on ipsec4_tunnel_validate"); #endif if (nxt != IPPROTO_IPV4) return 0; if (m->m_pkthdr.len < off + sizeof(struct ip)) return 0; /* do not decapsulate if the SA is for transport mode only */ if (sav->sah->saidx.mode == IPSEC_MODE_TRANSPORT) return 0; oip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = _IP_VHL_HL(oip->ip_vhl) << 2; #else hlen = oip->ip_hl << 2; #endif if (hlen != sizeof(struct ip)) return 0; /* AF_INET6 should be supported, but at this moment we don't. */ sin = (struct sockaddr_in *)&sav->sah->saidx.dst; if (sin->sin_family != AF_INET) return 0; if (bcmp(&oip->ip_dst, &sin->sin_addr, sizeof(oip->ip_dst)) != 0) return 0; /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); bzero(&isrc, sizeof(isrc)); bzero(&idst, sizeof(idst)); osrc.sin_family = odst.sin_family = isrc.sin_family = idst.sin_family = AF_INET; osrc.sin_len = odst.sin_len = isrc.sin_len = idst.sin_len = sizeof(struct sockaddr_in); osrc.sin_addr = oip->ip_src; odst.sin_addr = oip->ip_dst; m_copydata(m, off + offsetof(struct ip, ip_src), sizeof(isrc.sin_addr), (caddr_t)&isrc.sin_addr); m_copydata(m, off + offsetof(struct ip, ip_dst), sizeof(idst.sin_addr), (caddr_t)&idst.sin_addr); /* * RFC2401 5.2.1 (b): (assume that we are using tunnel mode) * - if the inner destination is multicast address, there can be * multiple permissible inner source address. implementation * may want to skip verification of inner source address against * SPD selector. * - if the inner protocol is ICMP, the packet may be an error report * from routers on the other side of the VPN cloud (R in the * following diagram). in this case, we cannot verify inner source * address against SPD selector. 
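ipsec4_splithdr() and its IPv6 twin exist to establish a single invariant for the output routines: on return, the first mbuf holds exactly the IP header, so esp4_output() and friends can link a protocol header in between header and payload without copying the payload. A caller-side sketch, as it would look inside this file (the wrapper name is invented):

/*
 * before:  m -> [ IP hdr + payload ... ]
 * after:   m -> [ IP hdr ] -> [ payload ... ]
 */
static int
split_for_output(struct mbuf **mp, struct ip **ipp)
{
	*mp = ipsec4_splithdr(*mp);	/* frees the chain on failure */
	if (*mp == NULL)
		return ENOMEM;
	*ipp = mtod(*mp, struct ip *);	/* header alone in the first mbuf */
	return 0;
}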
* me -- gw === gw -- R -- you * * we consider the first bullet to be users responsibility on SPD entry * configuration (if you need to encrypt multicast traffic, set * the source range of SPD selector to 0.0.0.0/0, or have explicit * address ranges for possible senders). * the second bullet is not taken care of (yet). * * therefore, we do not do anything special about inner source. */ sp = key_gettunnel((struct sockaddr *)&osrc, (struct sockaddr *)&odst, (struct sockaddr *)&isrc, (struct sockaddr *)&idst); if (!sp) return 0; key_freesp(sp); return 1; } #ifdef INET6 /* validate inbound IPsec tunnel packet. */ int ipsec6_tunnel_validate(m, off, nxt0, sav) struct mbuf *m; /* no pullup permitted, m->m_len >= ip */ int off; u_int nxt0; struct secasvar *sav; { u_int8_t nxt = nxt0 & 0xff; struct sockaddr_in6 *sin6; struct sockaddr_in6 osrc, odst, isrc, idst; struct secpolicy *sp; struct ip6_hdr *oip6; #ifdef DIAGNOSTIC if (m->m_len < sizeof(struct ip6_hdr)) panic("too short mbuf on ipsec6_tunnel_validate"); #endif if (nxt != IPPROTO_IPV6) return 0; if (m->m_pkthdr.len < off + sizeof(struct ip6_hdr)) return 0; /* do not decapsulate if the SA is for transport mode only */ if (sav->sah->saidx.mode == IPSEC_MODE_TRANSPORT) return 0; oip6 = mtod(m, struct ip6_hdr *); /* AF_INET should be supported, but at this moment we don't. */ sin6 = (struct sockaddr_in6 *)&sav->sah->saidx.dst; if (sin6->sin6_family != AF_INET6) return 0; if (!IN6_ARE_ADDR_EQUAL(&oip6->ip6_dst, &sin6->sin6_addr)) return 0; /* XXX slow */ bzero(&osrc, sizeof(osrc)); bzero(&odst, sizeof(odst)); bzero(&isrc, sizeof(isrc)); bzero(&idst, sizeof(idst)); osrc.sin6_family = odst.sin6_family = isrc.sin6_family = idst.sin6_family = AF_INET6; osrc.sin6_len = odst.sin6_len = isrc.sin6_len = idst.sin6_len = sizeof(struct sockaddr_in6); osrc.sin6_addr = oip6->ip6_src; odst.sin6_addr = oip6->ip6_dst; m_copydata(m, off + offsetof(struct ip6_hdr, ip6_src), sizeof(isrc.sin6_addr), (caddr_t)&isrc.sin6_addr); m_copydata(m, off + offsetof(struct ip6_hdr, ip6_dst), sizeof(idst.sin6_addr), (caddr_t)&idst.sin6_addr); /* * regarding to inner source address validation, see a long comment * in ipsec4_tunnel_validate. */ sp = key_gettunnel((struct sockaddr *)&osrc, (struct sockaddr *)&odst, (struct sockaddr *)&isrc, (struct sockaddr *)&idst); /* * when there is no suitable inbound policy for the packet of the ipsec * tunnel mode, the kernel never decapsulate the tunneled packet * as the ipsec tunnel mode even when the system wide policy is "none". * then the kernel leaves the generic tunnel module to process this * packet. if there is no rule of the generic tunnel, the packet * is rejected and the statistics will be counted up. */ if (!sp) return 0; key_freesp(sp); return 1; } #endif /* * Make a mbuf chain for encryption. * If the original mbuf chain contains a mbuf with a cluster, * allocate a new cluster and copy the data to the new cluster. * XXX: this hack is inefficient, but is necessary to handle cases * of TCP retransmission... */ struct mbuf * ipsec_copypkt(m) struct mbuf *m; { struct mbuf *n, **mpp, *mnew; for (n = m, mpp = &m; n; n = n->m_next) { if (n->m_flags & M_EXT) { /* * Make a copy only if there are more than one * references to the cluster. * XXX: is this approach effective? 
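The test that resumes just below decides when a private copy is unavoidable: storage that is shared (extra references) or external but not an ordinary cluster cannot be encrypted in place, because TCP may retransmit from the very same cluster later. The same decision written as a small predicate; kernel context with this file's includes is assumed, and the function name is invented.

/* sketch only: must this mbuf's data be copied before in-place encryption? */
static int
must_copy_before_encrypt(struct mbuf *n)
{
	if ((n->m_flags & M_EXT) == 0)
		return 0;	/* data lives in the mbuf itself: private */
	if (n->m_ext.ext_type == EXT_CLUSTER && !MEXT_IS_REF(n))
		return 0;	/* ordinary, unshared cluster: safe in place */
	return 1;		/* shared or foreign external storage */
}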
*/ if (n->m_ext.ext_type != EXT_CLUSTER || MEXT_IS_REF(n)) { int remain, copied; struct mbuf *mm; if (n->m_flags & M_PKTHDR) { MGETHDR(mnew, M_DONTWAIT, MT_HEADER); if (mnew == NULL) goto fail; mnew->m_pkthdr = n->m_pkthdr; #if 0 if (n->m_pkthdr.aux) { mnew->m_pkthdr.aux = m_copym(n->m_pkthdr.aux, 0, M_COPYALL, M_DONTWAIT); } #endif M_COPY_PKTHDR(mnew, n); mnew->m_flags = n->m_flags & M_COPYFLAGS; } else { MGET(mnew, M_DONTWAIT, MT_DATA); if (mnew == NULL) goto fail; } mnew->m_len = 0; mm = mnew; /* * Copy data. If we don't have enough space to * store the whole data, allocate a cluster * or additional mbufs. * XXX: we don't use m_copyback(), since the * function does not use clusters and thus is * inefficient. */ remain = n->m_len; copied = 0; while (1) { int len; struct mbuf *mn; if (remain <= (mm->m_flags & M_PKTHDR ? MHLEN : MLEN)) len = remain; else { /* allocate a cluster */ MCLGET(mm, M_DONTWAIT); if (!(mm->m_flags & M_EXT)) { m_free(mm); goto fail; } len = remain < MCLBYTES ? remain : MCLBYTES; } bcopy(n->m_data + copied, mm->m_data, len); copied += len; remain -= len; mm->m_len = len; if (remain <= 0) /* completed? */ break; /* need another mbuf */ MGETHDR(mn, M_DONTWAIT, MT_HEADER); if (mn == NULL) goto fail; mn->m_pkthdr.rcvif = NULL; mm->m_next = mn; mm = mn; } /* adjust chain */ mm->m_next = m_free(n); n = mm; *mpp = mnew; mpp = &n->m_next; continue; } } *mpp = n; mpp = &n->m_next; } return(m); fail: m_freem(m); return(NULL); } -static struct mbuf * -ipsec_addaux(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = m_aux_find(m, AF_INET, IPPROTO_ESP); - if (!n) - n = m_aux_add(m, AF_INET, IPPROTO_ESP); - if (!n) - return n; /* ENOBUFS */ - n->m_len = sizeof(struct socket *); - bzero(mtod(n, void *), n->m_len); - return n; -} - -static struct mbuf * -ipsec_findaux(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = m_aux_find(m, AF_INET, IPPROTO_ESP); -#ifdef DIAGNOSTIC - if (n && n->m_len < sizeof(struct socket *)) - panic("invalid ipsec m_aux"); -#endif - return n; -} - void ipsec_delaux(m) struct mbuf *m; { - struct mbuf *n; + struct m_tag *tag; - n = m_aux_find(m, AF_INET, IPPROTO_ESP); - if (n) - m_aux_delete(m, n); + while ((tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, NULL)) != NULL) + m_tag_delete(m, tag); } -/* if the aux buffer is unnecessary, nuke it. 
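The hunk above is the core of this change: the per-packet IPsec history moves from a hand-managed aux mbuf to packet tags, one PACKET_TAG_IPSEC_HISTORY tag per AH/ESP layer, each prepended by ipsec_addhist(). A sketch of a consumer that wants the whole history rather than the single newest entry the reworked ipsec_gethist() returns; kernel m_tag(9) context is assumed, and the walk relies on m_tag_find() resuming its search after the tag passed as the third argument.

/* sketch only: count the ESP layers recorded on this packet */
static int
count_esp_layers(struct mbuf *m)
{
	struct m_tag *tag;
	struct ipsec_history *p;
	int n = 0;

	for (tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, NULL);
	    tag != NULL;
	    tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, tag)) {
		p = (struct ipsec_history *)(tag + 1);	/* payload follows */
		if (p->ih_proto == IPPROTO_ESP)
			n++;
	}
	return n;
}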
*/ -static void -ipsec_optaux(m, n) - struct mbuf *m; - struct mbuf *n; -{ - - if (!n) - return; - if (n->m_len == sizeof(struct socket *) && !*mtod(n, struct socket **)) - ipsec_delaux(m); -} - int -ipsec_setsocket(m, so) - struct mbuf *m; - struct socket *so; -{ - struct mbuf *n; - - /* if so == NULL, don't insist on getting the aux mbuf */ - if (so) { - n = ipsec_addaux(m); - if (!n) - return ENOBUFS; - } else - n = ipsec_findaux(m); - if (n && n->m_len >= sizeof(struct socket *)) - *mtod(n, struct socket **) = so; - ipsec_optaux(m, n); - return 0; -} - -struct socket * -ipsec_getsocket(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = ipsec_findaux(m); - if (n && n->m_len >= sizeof(struct socket *)) - return *mtod(n, struct socket **); - else - return NULL; -} - -int ipsec_addhist(m, proto, spi) struct mbuf *m; int proto; u_int32_t spi; { - struct mbuf *n; + struct m_tag *tag; struct ipsec_history *p; - n = ipsec_addaux(m); - if (!n) + tag = m_tag_get(PACKET_TAG_IPSEC_HISTORY, + sizeof (struct ipsec_history), M_NOWAIT); + if (tag == NULL) return ENOBUFS; - if (M_TRAILINGSPACE(n) < sizeof(*p)) - return ENOSPC; /* XXX */ - p = (struct ipsec_history *)(mtod(n, caddr_t) + n->m_len); - n->m_len += sizeof(*p); + p = (struct ipsec_history *)(tag+1); bzero(p, sizeof(*p)); p->ih_proto = proto; p->ih_spi = spi; + m_tag_prepend(m, tag); return 0; } struct ipsec_history * ipsec_gethist(m, lenp) struct mbuf *m; int *lenp; { - struct mbuf *n; - int l; + struct m_tag *tag; - n = ipsec_findaux(m); - if (!n) + tag = m_tag_find(m, PACKET_TAG_IPSEC_HISTORY, NULL); + if (tag == NULL) return NULL; - l = n->m_len; - if (sizeof(struct socket *) > l) - return NULL; - if ((l - sizeof(struct socket *)) % sizeof(struct ipsec_history)) - return NULL; - /* XXX does it make more sense to divide by sizeof(ipsec_history)? */ + /* XXX NB: noone uses this so fake it */ if (lenp) - *lenp = l - sizeof(struct socket *); - return (struct ipsec_history *) - (mtod(n, caddr_t) + sizeof(struct socket *)); -} - -void -ipsec_clearhist(m) - struct mbuf *m; -{ - struct mbuf *n; - - n = ipsec_findaux(m); - if ((n) && n->m_len > sizeof(struct socket *)) - n->m_len = sizeof(struct socket *); - ipsec_optaux(m, n); + *lenp = sizeof (struct ipsec_history); + return ((struct ipsec_history *)(tag+1)); } Index: head/sys/netinet6/ipsec.h =================================================================== --- head/sys/netinet6/ipsec.h (revision 105193) +++ head/sys/netinet6/ipsec.h (revision 105194) @@ -1,354 +1,351 @@ /* $FreeBSD$ */ /* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #ifndef _NETINET6_IPSEC_H_ #define _NETINET6_IPSEC_H_ #if defined(_KERNEL) && !defined(_LKM) && !defined(KLD_MODULE) #include "opt_inet.h" #include "opt_ipsec.h" #endif #include #include #ifdef _KERNEL /* * Security Policy Index * Ensure that both address families in the "src" and "dst" are same. * When the value of the ul_proto is ICMPv6, the port field in "src" * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code. */ struct secpolicyindex { u_int8_t dir; /* direction of packet flow, see blow */ struct sockaddr_storage src; /* IP src address for SP */ struct sockaddr_storage dst; /* IP dst address for SP */ u_int8_t prefs; /* prefix length in bits for src */ u_int8_t prefd; /* prefix length in bits for dst */ u_int16_t ul_proto; /* upper layer Protocol */ #ifdef notyet uid_t uids; uid_t uidd; gid_t gids; gid_t gidd; #endif }; /* Security Policy Data Base */ struct secpolicy { LIST_ENTRY(secpolicy) chain; int refcnt; /* reference count */ struct secpolicyindex spidx; /* selector */ u_int32_t id; /* It's unique number on the system. */ u_int state; /* 0: dead, others: alive */ #define IPSEC_SPSTATE_DEAD 0 #define IPSEC_SPSTATE_ALIVE 1 u_int policy; /* DISCARD, NONE or IPSEC, see keyv2.h */ struct ipsecrequest *req; /* pointer to the ipsec request tree, */ /* if policy == IPSEC else this value == NULL.*/ /* * lifetime handler. * the policy can be used without limitiation if both lifetime and * validtime are zero. * "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime. * "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime. */ long created; /* time created the policy */ long lastused; /* updated every when kernel sends a packet */ long lifetime; /* duration of the lifetime of this policy */ long validtime; /* duration this policy is valid without use */ }; /* Request for IPsec */ struct ipsecrequest { struct ipsecrequest *next; /* pointer to next structure */ /* If NULL, it means the end of chain. */ struct secasindex saidx;/* hint for search proper SA */ /* if __ss_len == 0 then no address specified.*/ u_int level; /* IPsec level defined below. */ struct secasvar *sav; /* place holder of SA for use */ struct secpolicy *sp; /* back pointer to SP */ }; /* security policy in PCB */ struct inpcbpolicy { struct secpolicy *sp_in; struct secpolicy *sp_out; int priv; /* privileged socket ? */ }; /* SP acquiring list table. */ struct secspacq { LIST_ENTRY(secspacq) chain; struct secpolicyindex spidx; long created; /* for lifetime */ int count; /* for lifetime */ /* XXX: here is mbuf place holder to be sent ? */ }; #endif /* _KERNEL */ /* according to IANA assignment, port 0x0000 and proto 0xff are reserved. 
*/ #define IPSEC_PORT_ANY 0 #define IPSEC_ULPROTO_ANY 255 #define IPSEC_PROTO_ANY 255 /* mode of security protocol */ /* NOTE: DON'T use IPSEC_MODE_ANY at SPD. It's only use in SAD */ #define IPSEC_MODE_ANY 0 /* i.e. wildcard. */ #define IPSEC_MODE_TRANSPORT 1 #define IPSEC_MODE_TUNNEL 2 /* * Direction of security policy. * NOTE: Since INVALID is used just as flag. * The other are used for loop counter too. */ #define IPSEC_DIR_ANY 0 #define IPSEC_DIR_INBOUND 1 #define IPSEC_DIR_OUTBOUND 2 #define IPSEC_DIR_MAX 3 #define IPSEC_DIR_INVALID 4 /* Policy level */ /* * IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB, * DISCARD, IPSEC and NONE are allowed for setkey() in SPD. * DISCARD and NONE are allowed for system default. */ #define IPSEC_POLICY_DISCARD 0 /* discarding packet */ #define IPSEC_POLICY_NONE 1 /* through IPsec engine */ #define IPSEC_POLICY_IPSEC 2 /* do IPsec */ #define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */ #define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */ /* Security protocol level */ #define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */ #define IPSEC_LEVEL_USE 1 /* use SA if present. */ #define IPSEC_LEVEL_REQUIRE 2 /* require SA. */ #define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */ #define IPSEC_MANUAL_REQID_MAX 0x3fff /* * if security policy level == unique, this id * indicate to a relative SA for use, else is * zero. * 1 - 0x3fff are reserved for manual keying. * 0 are reserved for above reason. Others is * for kernel use. * Note that this id doesn't identify SA * by only itself. */ #define IPSEC_REPLAYWSIZE 32 /* statistics for ipsec processing */ struct ipsecstat { u_quad_t in_success; /* succeeded inbound process */ u_quad_t in_polvio; /* security policy violation for inbound process */ u_quad_t in_nosa; /* inbound SA is unavailable */ u_quad_t in_inval; /* inbound processing failed due to EINVAL */ u_quad_t in_nomem; /* inbound processing failed due to ENOBUFS */ u_quad_t in_badspi; /* failed getting a SPI */ u_quad_t in_ahreplay; /* AH replay check failed */ u_quad_t in_espreplay; /* ESP replay check failed */ u_quad_t in_ahauthsucc; /* AH authentication success */ u_quad_t in_ahauthfail; /* AH authentication failure */ u_quad_t in_espauthsucc; /* ESP authentication success */ u_quad_t in_espauthfail; /* ESP authentication failure */ u_quad_t in_esphist[256]; u_quad_t in_ahhist[256]; u_quad_t in_comphist[256]; u_quad_t out_success; /* succeeded outbound process */ u_quad_t out_polvio; /* security policy violation for outbound process */ u_quad_t out_nosa; /* outbound SA is unavailable */ u_quad_t out_inval; /* outbound process failed due to EINVAL */ u_quad_t out_nomem; /* inbound processing failed due to ENOBUFS */ u_quad_t out_noroute; /* there is no route */ u_quad_t out_esphist[256]; u_quad_t out_ahhist[256]; u_quad_t out_comphist[256]; }; /* * Definitions for IPsec & Key sysctl operations. 
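The ipsecstat structure above is exported read-only through the IPSECCTL_STATS entry defined just below. A userland sketch that reads it; the MIB string assumes FreeBSD's conventional net.inet.ipsec prefix, which is worth verifying against sysctl -a.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet6/ipsec.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct ipsecstat st;
	size_t len = sizeof(st);

	if (sysctlbyname("net.inet.ipsec.stats", &st, &len, NULL, 0) == -1)
		err(1, "sysctlbyname");	/* MIB name is an assumption */
	printf("in ok %llu, no SA %llu, policy violations %llu\n",
	    (unsigned long long)st.in_success,
	    (unsigned long long)st.in_nosa,
	    (unsigned long long)st.in_polvio);
	return 0;
}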
*/ /* * Names for IPsec & Key sysctl objects */ #define IPSECCTL_STATS 1 /* stats */ #define IPSECCTL_DEF_POLICY 2 #define IPSECCTL_DEF_ESP_TRANSLEV 3 /* int; ESP transport mode */ #define IPSECCTL_DEF_ESP_NETLEV 4 /* int; ESP tunnel mode */ #define IPSECCTL_DEF_AH_TRANSLEV 5 /* int; AH transport mode */ #define IPSECCTL_DEF_AH_NETLEV 6 /* int; AH tunnel mode */ #if 0 /* obsolete, do not reuse */ #define IPSECCTL_INBOUND_CALL_IKE 7 #endif #define IPSECCTL_AH_CLEARTOS 8 #define IPSECCTL_AH_OFFSETMASK 9 #define IPSECCTL_DFBIT 10 #define IPSECCTL_ECN 11 #define IPSECCTL_DEBUG 12 #define IPSECCTL_ESP_RANDPAD 13 #define IPSECCTL_MAXID 14 #define IPSECCTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { "def_policy", CTLTYPE_INT }, \ { "esp_trans_deflev", CTLTYPE_INT }, \ { "esp_net_deflev", CTLTYPE_INT }, \ { "ah_trans_deflev", CTLTYPE_INT }, \ { "ah_net_deflev", CTLTYPE_INT }, \ { 0, 0 }, \ { "ah_cleartos", CTLTYPE_INT }, \ { "ah_offsetmask", CTLTYPE_INT }, \ { "dfbit", CTLTYPE_INT }, \ { "ecn", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "esp_randpad", CTLTYPE_INT }, \ } #define IPSEC6CTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { "def_policy", CTLTYPE_INT }, \ { "esp_trans_deflev", CTLTYPE_INT }, \ { "esp_net_deflev", CTLTYPE_INT }, \ { "ah_trans_deflev", CTLTYPE_INT }, \ { "ah_net_deflev", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "ecn", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "esp_randpad", CTLTYPE_INT }, \ } #ifdef _KERNEL struct ipsec_output_state { struct mbuf *m; struct route *ro; struct sockaddr *dst; }; struct ipsec_history { int ih_proto; u_int32_t ih_spi; }; extern int ipsec_debug; extern struct ipsecstat ipsecstat; extern struct secpolicy ip4_def_policy; extern int ip4_esp_trans_deflev; extern int ip4_esp_net_deflev; extern int ip4_ah_trans_deflev; extern int ip4_ah_net_deflev; extern int ip4_ah_cleartos; extern int ip4_ah_offsetmask; extern int ip4_ipsec_dfbit; extern int ip4_ipsec_ecn; extern int ip4_esp_randpad; #define ipseclog(x) do { if (ipsec_debug) log x; } while (0) extern struct secpolicy *ipsec4_getpolicybysock __P((struct mbuf *, u_int, struct socket *, int *)); extern struct secpolicy *ipsec4_getpolicybyaddr __P((struct mbuf *, u_int, int, int *)); struct inpcb; extern int ipsec_init_policy __P((struct socket *so, struct inpcbpolicy **)); extern int ipsec_copy_policy __P((struct inpcbpolicy *, struct inpcbpolicy *)); extern u_int ipsec_get_reqlevel __P((struct ipsecrequest *)); extern int ipsec4_set_policy __P((struct inpcb *inp, int optname, caddr_t request, size_t len, int priv)); extern int ipsec4_get_policy __P((struct inpcb *inpcb, caddr_t request, size_t len, struct mbuf **mp)); extern int ipsec4_delete_pcbpolicy __P((struct inpcb *)); extern int ipsec4_in_reject_so __P((struct mbuf *, struct socket *)); extern int ipsec4_in_reject __P((struct mbuf *, struct inpcb *)); struct secas; struct tcpcb; extern int ipsec_chkreplay __P((u_int32_t, struct secasvar *)); extern int ipsec_updatereplay __P((u_int32_t, struct secasvar *)); extern size_t ipsec4_hdrsiz __P((struct mbuf *, u_int, struct inpcb *)); extern size_t ipsec_hdrsiz_tcp __P((struct tcpcb *)); struct ip; extern const char *ipsec4_logpacketstr __P((struct ip *, u_int32_t)); extern const char *ipsec_logsastr __P((struct secasvar *)); extern void ipsec_dumpmbuf __P((struct mbuf *)); extern int ipsec4_output __P((struct ipsec_output_state *, struct secpolicy *, int)); extern int ipsec4_tunnel_validate __P((struct mbuf *, int, u_int, struct secasvar *)); extern struct mbuf *ipsec_copypkt 
__P((struct mbuf *)); extern void ipsec_delaux __P((struct mbuf *)); -extern int ipsec_setsocket __P((struct mbuf *, struct socket *)); -extern struct socket *ipsec_getsocket __P((struct mbuf *)); extern int ipsec_addhist __P((struct mbuf *, int, u_int32_t)); extern struct ipsec_history *ipsec_gethist __P((struct mbuf *, int *)); -extern void ipsec_clearhist __P((struct mbuf *)); #endif /* _KERNEL */ #ifndef _KERNEL extern caddr_t ipsec_set_policy __P((char *, int)); extern int ipsec_get_policylen __P((caddr_t)); extern char *ipsec_dump_policy __P((caddr_t, char *)); extern const char *ipsec_strerror __P((void)); #endif /* !_KERNEL */ #endif /* _NETINET6_IPSEC_H_ */ Index: head/sys/netinet6/mld6.c =================================================================== --- head/sys/netinet6/mld6.c (revision 105193) +++ head/sys/netinet6/mld6.c (revision 105194) @@ -1,474 +1,474 @@ /* $FreeBSD$ */ /* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /* * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Protocol constants */ /* denotes that the MLD max response delay field specifies time in milliseconds */ #define MLD6_TIMER_SCALE 1000 /* * time between repetitions of a node's initial report of interest in a * multicast address(in seconds) */ #define MLD6_UNSOLICITED_REPORT_INTERVAL 10 static struct ip6_pktopts ip6_opts; static int mld6_timers_are_running; /* XXX: These are necessary for KAME's link-local hack */ static struct in6_addr mld6_all_nodes_linklocal = IN6ADDR_LINKLOCAL_ALLNODES_INIT; static struct in6_addr mld6_all_routers_linklocal = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; static void mld6_sendpkt __P((struct in6_multi *, int, const struct in6_addr *)); void mld6_init() { static u_int8_t hbh_buf[8]; struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); mld6_timers_are_running = 0; /* ip6h_nxt will be fill in later */ hbh->ip6h_len = 0; /* (8 >> 3) - 1 */ /* XXX: grotty hard coding... */ hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */ hbh_buf[3] = 0; hbh_buf[4] = IP6OPT_RTALERT; hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t)); init_ip6pktopts(&ip6_opts); ip6_opts.ip6po_hbh = hbh; } void mld6_start_listening(in6m) struct in6_multi *in6m; { int s = splnet(); /* * RFC2710 page 10: * The node never sends a Report or Done for the link-scope all-nodes * address. * MLD messages are never sent for multicast addresses whose scope is 0 * (reserved) or 1 (node-local). 
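* For example, a join of ff02::1 (the link-scope all-nodes group) or of any ff01::/16 (node-local) group is recorded in the membership list below but never generates a Report or Done on the wire.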
*/ mld6_all_nodes_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX */ if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) { in6m->in6m_timer = 0; in6m->in6m_state = MLD6_OTHERLISTENER; } else { mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = MLD6_RANDOM_DELAY( MLD6_UNSOLICITED_REPORT_INTERVAL * PR_FASTHZ); in6m->in6m_state = MLD6_IREPORTEDLAST; mld6_timers_are_running = 1; } splx(s); } void mld6_stop_listening(in6m) struct in6_multi *in6m; { mld6_all_nodes_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX */ mld6_all_routers_linklocal.s6_addr16[1] = htons(in6m->in6m_ifp->if_index); /* XXX: necessary when mrouting */ if (in6m->in6m_state == MLD6_IREPORTEDLAST && (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal)) && IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) > IPV6_ADDR_SCOPE_NODELOCAL) mld6_sendpkt(in6m, MLD_LISTENER_DONE, &mld6_all_routers_linklocal); } void mld6_input(m, off) struct mbuf *m; int off; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct mld_hdr *mldh; struct ifnet *ifp = m->m_pkthdr.rcvif; struct in6_multi *in6m; struct in6_ifaddr *ia; struct ifmultiaddr *ifma; int timer; /* timer value in the MLD query header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*mldh),); mldh = (struct mld_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh)); if (mldh == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif /* source address validation */ ip6 = mtod(m, struct ip6_hdr *);/* in case mpullup */ if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) { log(LOG_ERR, "mld6_input: src %s is not link-local (grp=%s)\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&mldh->mld_addr)); /* * spec (RFC2710) does not explicitly * specify to discard the packet from a non link-local * source address. But we believe it's expected to do so. * XXX: do we have to allow :: as source? */ m_freem(m); return; } /* * In the MLD6 specification, there are 3 states and a flag. * * In Non-Listener state, we simply don't have a membership record. * In Delaying Listener state, our timer is running (in6m->in6m_timer) * In Idle Listener state, our timer is not running (in6m->in6m_timer==0) * * The flag is in6m->in6m_state, it is set to MLD6_OTHERLISTENER if * we have heard a report from another member, or MLD6_IREPORTEDLAST * if we sent the last report. */ switch(mldh->mld_type) { case MLD_LISTENER_QUERY: if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) && !IN6_IS_ADDR_MULTICAST(&mldh->mld_addr)) break; /* print error or log stat? */ if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to the "all-nodes" group (ff02::1). * - Restart any timer that is already running but has * A value longer than the requested timeout. * - Use the value specified in the query message as * the maximum timeout. */ IFP_TO_IA6(ifp, ia); if (ia == NULL) break; /* * XXX: System timer resolution is too low to handle Max * Response Delay, so set 1 to the internal timer even if * the calculated value equals to zero when Max Response * Delay is positive. 
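* A worked example, assuming the usual PR_FASTHZ of 5 fast-timeout ticks per second: a query advertising a 100ms maximum delay gives 100 * 5 / 1000 == 0 in integer arithmetic, which would otherwise mean "respond immediately", so the timer is clamped to 1 tick below.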
*/ timer = ntohs(mldh->mld_maxdelay)*PR_FASTHZ/MLD6_TIMER_SCALE; if (timer == 0 && mldh->mld_maxdelay) timer = 1; mld6_all_nodes_linklocal.s6_addr16[1] = htons(ifp->if_index); /* XXX */ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; in6m = (struct in6_multi *)ifma->ifma_protospec; if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (IN6_IS_ADDR_UNSPECIFIED(&mldh->mld_addr) || IN6_ARE_ADDR_EQUAL(&mldh->mld_addr, &in6m->in6m_addr)) { if (timer == 0) { /* send a report immediately */ mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = 0; /* reset timer */ in6m->in6m_state = MLD6_IREPORTEDLAST; } else if (in6m->in6m_timer == 0 || /*idle state*/ in6m->in6m_timer > timer) { in6m->in6m_timer = MLD6_RANDOM_DELAY(timer); mld6_timers_are_running = 1; } } } if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = 0; /* XXX */ break; case MLD_LISTENER_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * can potentially get looped back if we are a multicast * router, so discard reports sourced by me. * Note that it is impossible to check IFF_LOOPBACK flag of * ifp for this purpose, since ip6_mloopback pass the physical * interface to looutput. */ if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */ break; if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr)) break; if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ /* * If we belong to the group being reported, stop * our timer for that group. */ IN6_LOOKUP_MULTI(mldh->mld_addr, ifp, in6m); if (in6m) { in6m->in6m_timer = 0; /* transit to idle state */ in6m->in6m_state = MLD6_OTHERLISTENER; /* clear flag */ } if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = 0; /* XXX */ break; default: /* this is impossible */ log(LOG_ERR, "mld6_input: illegal type(%d)", mldh->mld_type); break; } m_freem(m); } void mld6_fasttimeo() { struct in6_multi *in6m; struct in6_multistep step; int s; /* * Quick check to see if any work needs to be done, in order * to minimize the overhead of fasttimo processing. */ if (!mld6_timers_are_running) return; s = splnet(); mld6_timers_are_running = 0; IN6_FIRST_MULTI(step, in6m); while (in6m != NULL) { if (in6m->in6m_timer == 0) { /* do nothing */ } else if (--in6m->in6m_timer == 0) { mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_state = MLD6_IREPORTEDLAST; } else { mld6_timers_are_running = 1; } IN6_NEXT_MULTI(step, in6m); } splx(s); } static void mld6_sendpkt(in6m, type, dst) struct in6_multi *in6m; int type; const struct in6_addr *dst; { struct mbuf *mh, *md; struct mld_hdr *mldh; struct ip6_hdr *ip6; struct ip6_moptions im6o; struct in6_ifaddr *ia; struct ifnet *ifp = in6m->in6m_ifp; struct ifnet *outif = NULL; /* * At first, find a link local address on the outgoing interface * to use as the source address of the MLD packet. */ if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST)) == NULL) return; /* * Allocate mbufs to store ip6 header and MLD header. * We allocate 2 mbufs and make chain in advance because * it is more convenient when inserting the hop-by-hop option later. 
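* The resulting chain is mh (IPv6 header) -> md (MLD header); ip6_output() can then splice the hop-by-hop extension header in between the two mbufs without moving the MLD payload.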
*/ MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == NULL) return; MGET(md, M_DONTWAIT, MT_DATA); if (md == NULL) { m_free(mh); return; } mh->m_next = md; mh->m_pkthdr.rcvif = NULL; mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); MH_ALIGN(mh, sizeof(struct ip6_hdr)); /* fill in the ip6 header */ ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; /* ip6_hlim will be set by im6o.im6o_multicast_hlim */ ip6->ip6_src = ia->ia_addr.sin6_addr; ip6->ip6_dst = dst ? *dst : in6m->in6m_addr; /* fill in the MLD header */ md->m_len = sizeof(struct mld_hdr); mldh = mtod(md, struct mld_hdr *); mldh->mld_type = type; mldh->mld_code = 0; mldh->mld_cksum = 0; /* XXX: we assume the function will not be called for query messages */ mldh->mld_maxdelay = 0; mldh->mld_reserved = 0; mldh->mld_addr = in6m->in6m_addr; if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld_addr)) mldh->mld_addr.s6_addr16[1] = 0; /* XXX */ mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); /* construct multicast option */ bzero(&im6o, sizeof(im6o)); im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ im6o.im6o_multicast_loop = (ip6_mrouter != NULL); /* increment output statistics */ icmp6stat.icp6s_outhist[type]++; - ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif); + ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); switch (type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(outif, ifs6_out_mldquery); break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(outif, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(outif, ifs6_out_mlddone); break; } } } Index: head/sys/netinet6/nd6_nbr.c =================================================================== --- head/sys/netinet6/nd6_nbr.c (revision 105193) +++ head/sys/netinet6/nd6_nbr.c (revision 105194) @@ -1,1404 +1,1396 @@ /* $FreeBSD$ */ /* $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif #include #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; static struct dadq *nd6_dad_find __P((struct ifaddr *)); static void nd6_dad_starttimer __P((struct dadq *, int)); static void nd6_dad_stoptimer __P((struct dadq *)); static void nd6_dad_timer __P((struct ifaddr *)); static void nd6_dad_ns_output __P((struct dadq *, struct ifaddr *)); static void nd6_dad_ns_input __P((struct ifaddr *)); static void nd6_dad_na_input __P((struct ifaddr *)); static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect */ static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ /* * Input a Neighbor Solicitation Message. * * Based on RFC 2461 * Based on RFC 2462 (duplicated address detection) */ void nd6_ns_input(m, off, icmp6len) struct mbuf *m; int off, icmp6len; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; struct ifaddr *ifa; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; union nd_opts ndopts; struct sockaddr_dl *proxydl = NULL; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); goto bad; } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be the solicited-node multicast address.
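* A solicited-node multicast address has the form ff02:0:0:0:0:1:ffXX:XXXX, where the trailing 24 bits are copied from the target address; e.g. a DAD probe for fe80::0203:47ff:fe12:3456 must arrive on ff02::1:ff12:3456, which is exactly the shape the test below enforces.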
*/ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL /* don't check ifindex portion */ && daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); goto bad; } if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) taddr6.s6_addr16[1] = htons(ifp->if_index); icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ns_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is unicast/anycast MUST NOT add * NS IP dst is solicited-node multicast MUST add * * In this implementation, we add the target link-layer address by * default. We do not add one in the MUST NOT cases. */ #if 0 /* too much! */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &daddr6); if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)) tlladdr = 0; else #endif if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ if (!ifa) { struct rtentry *rt; struct sockaddr_in6 tsin6; bzero(&tsin6, sizeof tsin6); tsin6.sin6_len = sizeof(struct sockaddr_in6); tsin6.sin6_family = AF_INET6; tsin6.sin6_addr = taddr6; rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0); if (rt && (rt->rt_flags & RTF_ANNOUNCE) != 0 && rt->rt_gateway->sa_family == AF_LINK) { /* * proxy NDP for single entry */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ifa) { proxy = 1; proxydl = SDL(rt->rt_gateway); } } if (rt) rtfree(rt); } if (!ifa) { /* * We've got an NS packet, and we don't have that address * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(&saddr6))); goto freeit; } /* * We have a neighbor solicitation packet whose target address equals * one of my tentative addresses. * * src addr how to process? * --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462.
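*/ #if 0 /* condensed sketch of the table above; illustrative only, never compiled */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa); /* unspecified source: a peer's DAD probe */ else ; /* unicast source: ordinary address resolution, silently ignored */ goto freeit; #endif /* * The live code below implements exactly this dispatch for the tentative case: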
*/ if (tentative) { /* * If the source address is the unspecified address, it is for * duplicated address detection. * * If not, the packet is for address resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa); goto freeit; } /* * If the source address is the unspecified address, entries must not * be created or updated. * It looks like the sender is performing DAD. Output an NA toward * the all-nodes multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { saddr6 = in6addr_linklocal_allnodes; saddr6.s6_addr16[1] = htons(ifp->if_index); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), tlladdr, (struct sockaddr *)proxydl); goto freeit; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, tlladdr, (struct sockaddr *)proxydl); freeit: m_freem(m); return; bad: nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(&saddr6))); nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(&daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(&taddr6))); icmp6stat.icp6s_badns++; m_freem(m); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicated address detection) */ void nd6_ns_output(ifp, daddr6, taddr6, ln, dad) struct ifnet *ifp; const struct in6_addr *daddr6, *taddr6; struct llinfo_nd6 *ln; /* for source address determination */ int dad; /* duplicated address detection */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_ifaddr *ia = NULL; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac; struct ifnet *outif = NULL; if (IN6_IS_ADDR_MULTICAST(taddr6)) return; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_ns_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent?
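* Advancing m_data by max_linkhdr reserves leading space so the link-layer header can later be prepended in place; MH_ALIGN() would serve much the same purpose for this fixed-size packet by placing the data at the end of the mbuf.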
*/ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; } if (!dad) { #if 0 /* KAME way, exact address scope match */ /* * Select a source whose scope is the same as that of the dest. * Typically, the dest is link-local solicitation multicast * (i.e. neighbor discovery) or link-local/global unicast * (i.e. neighbor un-reachability detection). */ ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; #else /* spec-wise correct */ /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. * Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (saddr6), if: * - saddr6 is given from the caller (by giving "ln"), and * - saddr6 belongs to the outgoing interface. * Otherwise, we perform a scope-wise match. */ struct ip6_hdr *hip6; /* hold ip6 */ struct in6_addr *saddr6; if (ln && ln->ln_hold) { hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*hip6) < ln->ln_hold->m_len) saddr6 = &hip6->ip6_src; else saddr6 = NULL; } else saddr6 = NULL; if (saddr6 && in6ifa_ifpwithaddr(ifp, saddr6)) bcopy(saddr6, &ip6->ip6_src, sizeof(*saddr6)); else { ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; } #endif } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) */ bzero(&ip6->ip6_src, sizeof(ip6->ip6_src)); } nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; if (IN6_IS_SCOPE_LINKLOCAL(&nd_ns->nd_ns_target)) nd_ns->nd_ns_target.s6_addr16[1] = 0; /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (!dad && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif - ip6_output(m, NULL, NULL, dad ? IPV6_DADOUTPUT : 0, &im6o, &outif); + ip6_output(m, NULL, NULL, dad ? 
IPV6_DADOUTPUT : 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighborsolicit); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; } /* * Neighbor advertisement input handling. * * Based on RFC 2461 * Based on RFC 2462 (duplicated address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(m, off, icmp6len) struct mbuf *m; int off, icmp6len; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; #if 0 struct in6_addr saddr6 = ip6->ip6_src; #endif struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; int is_router; int is_solicited; int is_override; char *lladdr = NULL; int lladdrlen = 0; struct ifaddr *ifa; struct llinfo_nd6 *ln; struct rtentry *rt; struct sockaddr_dl *sdl; union nd_opts ndopts; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif taddr6 = nd_na->nd_na_target; flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) taddr6.s6_addr16[1] = htons(ifp->if_index); if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, "nd6_na_input: invalid target address %s\n", ip6_sprintf(&taddr6))); goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, "nd6_na_input: a solicited adv is multicasted\n")); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_na_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { nd6_dad_na_input(ifa); goto freeit; } /* Just for safety, maybe unnecessary. */ if (ifa) { log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", ip6_sprintf(&taddr6)); goto freeit; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be discarded. 
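* (RFC 2461 section 7.2.5: an advertisement that matches no existing Neighbor Cache entry must not create one; cache entries are created only by the node that initiates address resolution.)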
*/ rt = nd6_lookup(&taddr6, 0, ifp); if ((rt == NULL) || ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) || ((sdl = SDL(rt->rt_gateway)) == NULL)) goto freeit; if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link layer has an address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && !lladdr) goto freeit; /* * Record link-layer address, and update the state. */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) ln->ln_expire = time_second + nd_ifinfo[rt->rt_ifp->if_index].reachable; } else { ln->ln_state = ND6_LLINFO_STALE; ln->ln_expire = time_second + nd6_gctimer; } if ((ln->ln_router = is_router) != 0) { /* * This means a router's state has changed from * non-reachable to probably reachable, and might * affect the status of associated prefixes. */ pfxlist_onlink_check(); } } else { int llchange; /* * Check if the link-layer address has changed or not. */ if (!lladdr) llchange = 0; else { if (sdl->sdl_alen) { if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 1; }
/*
 * This is VERY complex.  Look at it with care.
 *
 *	override solicit lladdr	llchange	action
 *					(L: record lladdr)
 *
 *	0	0	n	--	(2c)
 *	0	0	y	n	(2b) L
 *	0	0	y	y	(1)    REACHABLE->STALE
 *	0	1	n	--	(2c)   *->REACHABLE
 *	0	1	y	n	(2b) L *->REACHABLE
 *	0	1	y	y	(1)    REACHABLE->STALE
 *	1	0	n	--	(2a)
 *	1	0	y	n	(2a) L
 *	1	0	y	y	(2a) L *->STALE
 *	1	1	n	--	(2a)   *->REACHABLE
 *	1	1	y	n	(2a) L *->REACHABLE
 *	1	1	y	y	(2a) L *->REACHABLE
 */
if (!is_override && (lladdr && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; ln->ln_expire = time_second + nd6_gctimer; } goto freeit; } else if (is_override /* (2a) */ || (!is_override && (lladdr && !llchange)) /* (2b) */ || !lladdr) { /* (2c) */ /* * Update the link-layer address, if any. */ if (lladdr) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (ln->ln_expire) { ln->ln_expire = time_second + nd_ifinfo[ifp->if_index].reachable; } } else { if (lladdr && llchange) { ln->ln_state = ND6_LLINFO_STALE; ln->ln_expire = time_second + nd6_gctimer; } } } if (ln->ln_router && !is_router) { /* * The peer dropped the router flag. * Remove the sender from the Default Router List and * update the Destination Cache entries. */ struct nd_defrouter *dr; struct in6_addr *in6; int s; in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; /* * Lock to protect the default router list. * XXX: this might be unnecessary, since this function * is only called under the network software interrupt * context. However, we keep it just for safety. */ s = splnet(); dr = defrouter_lookup(in6, rt->rt_ifp); if (dr) defrtrlist_del(dr); else if (!ip6_forwarding && ip6_accept_rtadv) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used * as a next hop for some destinations * (e.g. the redirect case). So we must * call rt6_flush explicitly.
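* Otherwise cloned destination-cache routes could keep naming a node that is no longer a router as their next hop. */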
*/ rt6_flush(&ip6->ip6_src, rt->rt_ifp); } splx(s); } ln->ln_router = is_router; } rt->rt_flags &= ~RTF_REJECT; ln->ln_asked = 0; if (ln->ln_hold) { /* * we assume ifp is not a loopback here, so just set the 2nd * argument as the 1st one. */ nd6_output(ifp, ifp, ln->ln_hold, (struct sockaddr_in6 *)rt_key(rt), rt); ln->ln_hold = 0; } freeit: m_freem(m); return; bad: icmp6stat.icp6s_badna++; m_freem(m); } /* * Neighbor advertisement output handling. * * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_output(ifp, daddr6, taddr6, flags, tlladdr, sdl0) struct ifnet *ifp; const struct in6_addr *daddr6, *taddr6; u_long flags; int tlladdr; /* 1 if include target link-layer address */ struct sockaddr *sdl0; /* sockaddr_dl (= proxy NA) or NULL */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct in6_ifaddr *ia = NULL; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac = NULL; struct ifnet *outif = NULL; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_na_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(daddr6)) { /* reply to DAD */ ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = 0; ip6->ip6_dst.s6_addr32[3] = IPV6_ADDR_INT32_ONE; flags &= ~ND_NA_FLAG_SOLICITED; } else ip6->ip6_dst = *daddr6; /* * Select a source whose scope is the same as that of the dest. */ ia = in6_ifawithifp(ifp, &ip6->ip6_dst); if (ia == NULL) { m_freem(m); return; } ip6->ip6_src = ia->ia_addr.sin6_addr; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; if (IN6_IS_SCOPE_LINKLOCAL(&nd_na->nd_na_target)) nd_na->nd_na_target.s6_addr16[1] = 0; /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. 
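* (For the proxy case the bits therefore come from the RTF_ANNOUNCE route's AF_LINK gateway that nd6_ns_input() captured in proxydl, so the advertised link-layer address can differ from this interface's own.)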
*/ if (sdl0 == NULL) mac = nd6_ifptomac(ifp); else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_short)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); -#ifdef IPSEC - /* Don't lookup socket */ - (void)ipsec_setsocket(m, NULL); -#endif - ip6_output(m, NULL, NULL, 0, &im6o, &outif); + ip6_output(m, NULL, NULL, 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighboradvert); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; } caddr_t nd6_ifptomac(ifp) struct ifnet *ifp; { switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif return ((caddr_t)(ifp + 1)); break; default: return NULL; } } TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; }; static struct dadq_head dadq; static int dad_init = 0; static struct dadq * nd6_dad_find(ifa) struct ifaddr *ifa; { struct dadq *dp; for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { if (dp->dad_ifa == ifa) return dp; } return NULL; } static void nd6_dad_starttimer(dp, ticks) struct dadq *dp; int ticks; { callout_reset(&dp->dad_timer_ch, ticks, (void (*) __P((void *)))nd6_dad_timer, (void *)dp->dad_ifa); } static void nd6_dad_stoptimer(dp) struct dadq *dp; { callout_stop(&dp->dad_timer_ch); } /* * Start Duplicated Address Detection (DAD) for specified interface address. */ void nd6_dad_start(ifa, tick) struct ifaddr *ifa; int *tick; /* minimum delay ticks for IFF_UP event */ { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; if (!dad_init) { TAILQ_INIT(&dadq); dad_init++; } /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled (ip6_dad_count == 0) * - the interface address is anycast */ if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (!ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (!ifa->ifa_ifp) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); if (!(ifa->ifa_ifp->if_flags & IFF_UP)) return; if (nd6_dad_find(ifa) != NULL) { /* DAD already in progress */ return; } dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? 
if_name(ifa->ifa_ifp) : "???"); return; } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ dp->dad_ifa = ifa; IFAREF(ifa); /* just for safety */ dp->dad_count = ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (tick == NULL) { nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { int ntick; if (*tick == 0) ntick = random() % (MAX_RTR_SOLICITATION_DELAY * hz); else ntick = *tick + random() % (hz / 2); *tick = ntick; nd6_dad_starttimer(dp, ntick); } } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(ifa) struct ifaddr *ifa; { struct dadq *dp; if (!dad_init) return; dp = nd6_dad_find(ifa); if (!dp) { /* DAD wasn't started yet */ return; } nd6_dad_stoptimer(dp); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_timer(ifa) struct ifaddr *ifa; { int s; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; s = splnet(); /* XXX */ /* Sanity check */ if (ia == NULL) { log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } /* timeouted with IFF_{RUNNING,UP} check */ if (dp->dad_ns_tcount > dad_maxtry) { nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); goto done; } /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, nd_ifinfo[ifa->ifa_ifp->if_index].retrans * hz / 1000); } else { /* * We have transmitted sufficient number of DAD packets. * See what we've got. */ int duplicate; duplicate = 0; if (dp->dad_na_icount) { /* * the check is in nd6_dad_na_input(), * but just in case */ duplicate++; } if (dp->dad_ns_icount) { #if 0 /* heuristics */ /* * if * - we have sent many(?) DAD NS, and * - the number of NS we sent equals to the * number of NS we've got, and * - we've got no NA * we may have a faulty network card/driver which * loops back multicasts to myself. */ if (3 < dp->dad_count && dp->dad_ns_icount == dp->dad_count && dp->dad_na_icount == 0) { log(LOG_INFO, "DAD questionable for %s(%s): " "network card loops back multicast?\n", ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp)); /* XXX consider it a duplicate or not? */ /* duplicate++; */ } else { /* We've seen NS, means DAD has failed. */ duplicate++; } #else /* We've seen NS, means DAD has failed. 
*/ duplicate++; #endif } if (duplicate) { /* (*dp) will be freed in nd6_dad_duplicated() */ dp = NULL; nd6_dad_duplicated(ifa); } else { /* * We are done with DAD. No NA came, no NS came. * No duplicated address was found. */ ia->ia6_flags &= ~IN6_IFF_TENTATIVE; nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } done: splx(s); } void nd6_dad_duplicated(ifa) struct ifaddr *ifa; { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_duplicated: DAD structure not found\n"); return; } log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " "NS in/out=%d/%d, NA in=%d\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; /* We are done with DAD; a duplicated address was found. (failure) */ nd6_dad_stoptimer(dp); log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", if_name(ifa->ifa_ifp)); TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_ns_output(dp, ifa) struct dadq *dp; struct ifaddr *ifa; { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { #if 0 printf("%s: interface down?\n", if_name(ifp)); #endif return; } if ((ifp->if_flags & IFF_RUNNING) == 0) { #if 0 printf("%s: interface not running?\n", if_name(ifp)); #endif return; } dp->dad_ns_ocount++; nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1); } static void nd6_dad_ns_input(ifa) struct ifaddr *ifa; { struct in6_ifaddr *ia; struct ifnet *ifp; const struct in6_addr *taddr6; struct dadq *dp; int duplicate; if (!ifa) panic("ifa == NULL in nd6_dad_ns_input"); ia = (struct in6_ifaddr *)ifa; ifp = ifa->ifa_ifp; taddr6 = &ia->ia_addr.sin6_addr; duplicate = 0; dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ if (dad_ignore_ns) { nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " "address %s(%s)\n", ip6_sprintf(taddr6), if_name(ifa->ifa_ifp))); return; } /* * if I'm yet to start DAD, someone else started using this address * first. I have a duplicate and you win. */ if (!dp || dp->dad_ns_ocount == 0) duplicate++; /* XXX more checks for loopback situation - see nd6_dad_timer too */ if (duplicate) { dp = NULL; /* will be freed in nd6_dad_duplicated() */ nd6_dad_duplicated(ifa); } else { /* * Not sure if I got a duplicate; * increment the NS count and see what happens. */ if (dp) dp->dad_ns_icount++; } } static void nd6_dad_na_input(ifa) struct ifaddr *ifa; { struct dadq *dp; if (!ifa) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa); if (dp) dp->dad_na_icount++; /* remove the address. */ nd6_dad_duplicated(ifa); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 105193) +++ head/sys/netinet6/raw_ip6.c (revision 105194) @@ -1,726 +1,719 @@ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ENABLE_DEFAULT_SCOPE #include #endif #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_long rip_sendspace; extern u_long rip_recvspace; struct rip6stat rip6stat; /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. */ int rip6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 rip6src; rip6stat.rip6s_ipackets++; if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX send icmp6 host/port unreach? */ m_freem(m); return IPPROTO_DONE; } init_sin6(&rip6src, m); /* general init */ LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->in6p_vflag & INP_IPV6) == 0) continue; if (in6p->in6p_ip6_nxt && in6p->in6p_ip6_nxt != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (in6p->in6p_cksum != -1) { rip6stat.rip6s_isum++; if (in6_cksum(m, ip6->ip6_nxt, *offp, m->m_pkthdr.len - *offp)) { rip6stat.rip6s_badsum++; continue; } } if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (n && ipsec6_in_reject_so(n, last->inp_socket)) { m_freem(n); ipsec6stat.in_polvio++; /* do not inject data into pcb */ } else #endif /*IPSEC*/ if (n) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, &opts, ip6, n); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; } } last = in6p; } #ifdef IPSEC /* * Check AH/ESP integrity. 
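* ipsec6_in_reject_so() tests the packet against the receiving socket's inbound security policy; rejected packets are counted in ipsec6stat.in_polvio and never reach the pcb. (The outbound side of this revision is the mirror image: rather than tagging the mbuf with ipsec_setsocket(), rip6_output() below now passes its inpcb as the new final argument of ip6_output(), which performs the policy lookup itself.)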
*/ if (last && ipsec6_in_reject_so(m, last->inp_socket)) { m_freem(m); ipsec6stat.in_polvio++; ip6stat.ip6s_delivered--; /* do not inject data into pcb */ } else #endif /*IPSEC*/ if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, &opts, ip6, m); /* strip intermediate headers */ m_adj(m, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); } else { rip6stat.rip6s_nosock++; if (m->m_flags & M_MCAST) rip6stat.rip6s_nosockmcast++; if (proto == IPPROTO_NONE) m_freem(m); else { char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } ip6stat.ip6s_delivered--; } return IPPROTO_DONE; } void rip6_ctlinput(cmd, sa, d) int cmd; struct sockaddr *sa; void *d; { struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct inpcb *(*notify) __P((struct inpcb *, int)) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&ripcb, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, notify); } /* * Generate IPv6 header and pass packet to ip6_output. * Tack on options user may have setup with control call. */ int #if __STDC__ rip6_output(struct mbuf *m, ...) #else rip6_output(m, va_alist) struct mbuf *m; va_dcl #endif { struct socket *so; struct sockaddr_in6 *dstsock; struct mbuf *control; struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp = 0; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int priv = 0; va_list ap; va_start(ap, m); so = va_arg(ap, struct socket *); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoin6pcb(so); priv = 0; if (so->so_cred->cr_uid == 0) priv = 1; dst = &dstsock->sin6_addr; if (control) { if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) goto bad; optp = &opt; } else optp = in6p->in6p_outputopts; /* * For an ICMPv6 packet, we should know its type and code * to update statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_TRYWAIT); ip6 = mtod(m, struct ip6_hdr *); /* * Next header might not be ICMP6 but use its pseudo header anyway. */ ip6->ip6_dst = *dst; /* * If the scope of the destination is link-local, embed the interface * index in the address. * * XXX advanced-api value overrides sin6_scope_id */ if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { struct in6_pktinfo *pi; /* * XXX Boundary check is assumed to be already done in * ip6_setpktoptions(). 
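* (so the ifnet_byindex(pi->ipi6_ifindex) dereference just below is safe without a second range check here)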
*/ if (optp && (pi = optp->ip6po_pktinfo) && pi->ipi6_ifindex) { ip6->ip6_dst.s6_addr16[1] = htons(pi->ipi6_ifindex); oifp = ifnet_byindex(pi->ipi6_ifindex); } else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && in6p->in6p_moptions && in6p->in6p_moptions->im6o_multicast_ifp) { oifp = in6p->in6p_moptions->im6o_multicast_ifp; ip6->ip6_dst.s6_addr16[1] = htons(oifp->if_index); } else if (dstsock->sin6_scope_id) { /* boundary check */ if (dstsock->sin6_scope_id < 0 || if_index < dstsock->sin6_scope_id) { error = ENXIO; /* XXX EINVAL? */ goto bad; } ip6->ip6_dst.s6_addr16[1] = htons(dstsock->sin6_scope_id & 0xffff);/*XXX*/ } } /* * Source address selection. */ { struct in6_addr *in6a; if ((in6a = in6_selectsrc(dstsock, optp, in6p->in6p_moptions, &in6p->in6p_route, &in6p->in6p_laddr, &error)) == 0) { if (error == 0) error = EADDRNOTAVAIL; goto bad; } ip6->ip6_src = *in6a; if (in6p->in6p_route.ro_rt) oifp = ifnet_byindex(in6p->in6p_route.ro_rt->rt_ifp->if_index); } ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = in6p->in6p_ip6_nxt; ip6->ip6_hlim = in6_selecthlim(in6p, oifp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* compute checksum */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } -#ifdef IPSEC - if (ipsec_setsocket(m, so) != 0) { - error = ENOBUFS; - goto bad; - } -#endif /*IPSEC*/ - error = ip6_output(m, optp, &in6p->in6p_route, 0, - in6p->in6p_moptions, &oifp); + in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); icmp6stat.icp6s_outhist[type]++; } else rip6stat.rip6s_opackets++; goto freectl; bad: if (m) m_freem(m); freectl: if (optp == &opt && optp->ip6po_rthdr && optp->ip6po_route.ro_rt) RTFREE(optp->ip6po_route.ro_rt); if (control) { if (optp == &opt) ip6_clearpktopts(optp, 0, -1); m_freem(control); } return(error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? 
*/ return(icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error, s; inp = sotoinpcb(so); if (inp) panic("rip6_attach"); if (td && (error = suser(td)) != 0) return error; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; s = splnet(); error = in_pcballoc(so, &ripcbinfo, td); splx(s); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; MALLOC(inp->in6p_icmp6filt, struct icmp6_filter *, sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); return 0; } static int rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); if (inp == 0) panic("rip6_detach"); /* xxx: RSVP */ if (so == ip6_mrouter) ip6_mrouter_done(); if (inp->in6p_icmp6filt) { FREE(inp->in6p_icmp6filt, M_PCB); inp->in6p_icmp6filt = NULL; } in6_pcbdetach(inp); return 0; } static int rip6_abort(struct socket *so) { soisdisconnected(so); return rip6_detach(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp = sotoinpcb(so); if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; inp->in6p_faddr = in6addr_any; return rip6_abort(so); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ia = NULL; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet) || addr->sin6_family != AF_INET6) return EADDRNOTAVAIL; #ifdef ENABLE_DEFAULT_SCOPE if (addr->sin6_scope_id == 0) { /* not change if specified */ addr->sin6_scope_id = scope6_addr2default(&addr->sin6_addr); } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return EADDRNOTAVAIL; if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { return(EADDRNOTAVAIL); } inp->in6p_laddr = addr->sin6_addr; return 0; } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; int error = 0; #ifdef ENABLE_DEFAULT_SCOPE struct sockaddr_in6 tmp; #endif if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if (addr->sin6_family != AF_INET6) return EAFNOSUPPORT; #ifdef ENABLE_DEFAULT_SCOPE if (addr->sin6_scope_id == 0) { /* not change if specified */ /* avoid overwrites */ tmp = *addr; addr = &tmp; addr->sin6_scope_id = scope6_addr2default(&addr->sin6_addr); } #endif /* Source address selection. XXX: need pcblookup? 
*/ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp->in6p_moptions, &inp->in6p_route, &inp->in6p_laddr, &error); if (in6a == NULL) return (error ? error : EADDRNOTAVAIL); inp->in6p_laddr = *in6a; inp->in6p_faddr = addr->sin6_addr; soisconnected(so); return 0; } static int rip6_shutdown(struct socket *so) { socantsendmore(so); return 0; } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return EISCONN; } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return ENOTCONN; } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; } #ifdef ENABLE_DEFAULT_SCOPE if (dst->sin6_scope_id == 0) { /* not change if specified */ dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr); } #endif return rip6_output(m, so, dst, control); } struct pr_usrreqs rip6_usrreqs = { rip6_abort, pru_accept_notsupp, rip6_attach, rip6_bind, rip6_connect, pru_connect2_notsupp, in6_control, rip6_detach, rip6_disconnect, pru_listen_notsupp, in6_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, rip6_send, pru_sense_null, rip6_shutdown, in6_setsockaddr, sosend, soreceive, sopoll }; Index: head/sys/netinet6/route6.c =================================================================== --- head/sys/netinet6/route6.c (revision 105193) +++ head/sys/netinet6/route6.c (revision 105194) @@ -1,222 +1,221 @@ /* $FreeBSD$ */ /* $KAME: route6.c,v 1.24 2001/03/14 03:07:05 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include static int ip6_rthdr0 __P((struct mbuf *, struct ip6_hdr *, struct ip6_rthdr0 *)); int route6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; /* proto is unused */ { struct ip6_hdr *ip6; struct mbuf *m = *mp; struct ip6_rthdr *rh; int off = *offp, rhlen; - struct mbuf *n; + struct ip6aux *ip6a; - n = ip6_findaux(m); - if (n) { - struct ip6aux *ip6a = mtod(n, struct ip6aux *); + ip6a = ip6_findaux(m); + if (ip6a) { /* XXX reject home-address option before rthdr */ if (ip6a->ip6a_flags & IP6A_SWAP) { ip6stat.ip6s_badoptions++; m_freem(m); return IPPROTO_DONE; } } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*rh), IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); rh = (struct ip6_rthdr *)((caddr_t)ip6 + off); #else ip6 = mtod(m, struct ip6_hdr *); IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, sizeof(*rh)); if (rh == NULL) { ip6stat.ip6s_tooshort++; return IPPROTO_DONE; } #endif switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rhlen = (rh->ip6r_len + 1) << 3; #ifndef PULLDOWN_TEST /* * note on option length: * due to IP6_EXTHDR_CHECK assumption, we cannot handle * very big routing header (max rhlen == 2048). */ IP6_EXTHDR_CHECK(m, off, rhlen, IPPROTO_DONE); #else /* * note on option length: * maximum rhlen: 2048 * max mbuf m_pulldown can handle: MCLBYTES == usually 2048 * so, here we are assuming that m_pulldown can handle * rhlen == 2048 case. this may not be a good thing to * assume - we may want to avoid pulling it up altogether. */ IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, rhlen); if (rh == NULL) { ip6stat.ip6s_tooshort++; return IPPROTO_DONE; } #endif if (ip6_rthdr0(m, ip6, (struct ip6_rthdr0 *)rh)) return(IPPROTO_DONE); break; default: /* unknown routing type */ if (rh->ip6r_segleft == 0) { rhlen = (rh->ip6r_len + 1) << 3; break; /* Final dst. Just ignore the header. */ } ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh->ip6r_type - (caddr_t)ip6); return(IPPROTO_DONE); } *offp += rhlen; return(rh->ip6r_nxt); } /* * Type0 routing header processing * * RFC2292 backward compatibility warning: no support for strict/loose bitmap, * as it was dropped between RFC1883 and RFC2460. */ static int ip6_rthdr0(m, ip6, rh0) struct mbuf *m; struct ip6_hdr *ip6; struct ip6_rthdr0 *rh0; { int addrs, index; struct in6_addr *nextaddr, tmpaddr; if (rh0->ip6r0_segleft == 0) return(0); if (rh0->ip6r0_len % 2 #ifdef COMPAT_RFC1883 || rh0->ip6r0_len > 46 #endif ) { /* * Type 0 routing header can't contain more than 23 addresses. * RFC 2462: this limitation was removed since strict/loose * bitmap field was deleted. */ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_len - (caddr_t)ip6); return(-1); } if ((addrs = rh0->ip6r0_len / 2) < rh0->ip6r0_segleft) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_segleft - (caddr_t)ip6); return(-1); } index = addrs - rh0->ip6r0_segleft; rh0->ip6r0_segleft--; /* note that ip6r0_addr does not exist in RFC2292bis */ nextaddr = rh0->ip6r0_addr + index; /* * reject invalid addresses. be proactive about malicious use of * IPv4 mapped/compat address. * XXX need more checks? 
*/ if (IN6_IS_ADDR_MULTICAST(nextaddr) || IN6_IS_ADDR_UNSPECIFIED(nextaddr) || IN6_IS_ADDR_V4MAPPED(nextaddr) || IN6_IS_ADDR_V4COMPAT(nextaddr)) { ip6stat.ip6s_badoptions++; m_freem(m); return(-1); } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badoptions++; m_freem(m); return(-1); } /* * Swap the IPv6 destination address and nextaddr. Forward the packet. */ tmpaddr = *nextaddr; *nextaddr = ip6->ip6_dst; if (IN6_IS_ADDR_LINKLOCAL(nextaddr)) nextaddr->s6_addr16[1] = 0; ip6->ip6_dst = tmpaddr; if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); #ifdef COMPAT_RFC1883 if (rh0->ip6r0_slmap[index / 8] & (1 << (7 - (index % 8)))) ip6_forward(m, IPV6_SRCRT_NEIGHBOR); else ip6_forward(m, IPV6_SRCRT_NOTNEIGHBOR); #else ip6_forward(m, 1); #endif return(-1); /* m would be freed in ip6_forward() */ } Index: head/sys/netinet6/udp6_output.c =================================================================== --- head/sys/netinet6/udp6_output.c (revision 105193) +++ head/sys/netinet6/udp6_output.c (revision 105194) @@ -1,318 +1,312 @@ /* $FreeBSD$ */ /* $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 */ #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif /* IPSEC */ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #define in6pcb inpcb #define udp6stat udpstat #define udp6s_opackets udps_opackets int udp6_output(in6p, m, addr6, control, td) struct in6pcb *in6p; struct mbuf *m; struct mbuf *control; struct sockaddr *addr6; struct thread *td; { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr; u_short fport; int error = 0; struct ip6_pktopts opt, *stickyopt = in6p->in6p_outputopts; int priv; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; priv = 0; if (td && !suser(td)) priv = 1; if (control) { if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) goto release; in6p->in6p_outputopts = &opt; } if (addr6) { /* * IPv4 version of udp_output calls in_pcbconnect in this case, * which needs splnet and affects performance. * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr6; if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; faddr = &sin6->sin6_addr; fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((in6p->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. 
* (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } else af = AF_INET; } /* KAME hack: embed scopeid */ if (in6_embedscope(&sin6->sin6_addr, sin6, in6p, NULL) != 0) { error = EINVAL; goto release; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { laddr = in6_selectsrc(sin6, in6p->in6p_outputopts, in6p->in6p_moptions, &in6p->in6p_route, &in6p->in6p_laddr, &error); } else laddr = &in6p->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (in6p->in6p_lport == 0 && (error = in6_pcbsetport(laddr, in6p, td)) != 0) goto release; } else { if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) { if ((in6p->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &in6p->in6p_laddr; faddr = &in6p->in6p_faddr; fport = in6p->in6p_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT); if (m == 0) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = in6p->in6p_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; #if 0 /* ip6_plen will be filled in ip6_output. */ ip6->ip6_plen = htons((u_short)plen); #endif ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = in6_selecthlim(in6p, in6p->in6p_route.ro_rt ? in6p->in6p_route.ro_rt->rt_ifp : NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if ((udp6->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr), plen)) == 0) { udp6->uh_sum = 0xffff; } flags = 0; udp6stat.udp6s_opackets++; -#ifdef IPSEC - if (ipsec_setsocket(m, in6p->in6p_socket) != 0) { - error = ENOBUFS; - goto release; - } -#endif /* IPSEC */ error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route, - flags, in6p->in6p_moptions, NULL); + flags, in6p->in6p_moptions, NULL, in6p); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(in6p->in6p_outputopts, 0, -1); in6p->in6p_outputopts = stickyopt; m_freem(control); } return(error); } Index: head/sys/netipx/ipx_ip.c =================================================================== --- head/sys/netipx/ipx_ip.c (revision 105193) +++ head/sys/netipx/ipx_ip.c (revision 105194) @@ -1,454 +1,454 @@ /* * Copyright (c) 1995, Mike Mitchell * Copyright (c) 1984, 1985, 1986, 1987, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ipx_ip.c * * $FreeBSD$ */ /* * Software interface driver for encapsulating IPX in IP. */ #include "opt_inet.h" #include "opt_ipx.h" #ifdef IPXIP #ifndef INET #error The option IPXIP requires option INET. #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct ifnet ipxipif; /* list of all hosts and gateways or broadcast addrs */ static struct ifnet_en *ipxip_list; static struct ifnet_en *ipxipattach(void); static int ipxip_free(struct ifnet *ifp); static int ipxipioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ipxipoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); static void ipxip_rtchange(struct in_addr *dst); static void ipxipstart(struct ifnet *ifp); static struct ifnet_en * ipxipattach() { register struct ifnet_en *m; register struct ifnet *ifp; if (ipxipif.if_mtu == 0) { ifp = &ipxipif; ifp->if_name = "ipxip"; ifp->if_mtu = LOMTU; ifp->if_ioctl = ipxipioctl; ifp->if_output = ipxipoutput; ifp->if_start = ipxipstart; ifp->if_flags = IFF_POINTOPOINT; } MALLOC((m), struct ifnet_en *, sizeof(*m), M_PCB, M_NOWAIT | M_ZERO); if (m == NULL) return (NULL); m->ifen_next = ipxip_list; ipxip_list = m; ifp = &m->ifen_ifnet; ifp->if_name = "ipxip"; ifp->if_mtu = LOMTU; ifp->if_ioctl = ipxipioctl; ifp->if_output = ipxipoutput; ifp->if_start = ipxipstart; ifp->if_flags = IFF_POINTOPOINT; ifp->if_unit = ipxipif.if_unit++; if_attach(ifp); return (m); } /* * Process an ioctl request. */ static int ipxipioctl(ifp, cmd, data) register struct ifnet *ifp; u_long cmd; caddr_t data; { int error = 0; struct ifreq *ifr; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; /* FALLTHROUGH */ case SIOCSIFDSTADDR: /* * Everything else is done at a higher level. 
*/ break; case SIOCSIFFLAGS: ifr = (struct ifreq *)data; if ((ifr->ifr_flags & IFF_UP) == 0) error = ipxip_free(ifp); break; default: error = EINVAL; } return (error); } static struct mbuf *ipxip_badlen; static struct mbuf *ipxip_lastin; static int ipxip_hold_input; void ipxip_input(m, hlen) register struct mbuf *m; int hlen; { register struct ip *ip; register struct ipx *ipx; register struct ifqueue *ifq = &ipxintrq; int len, s; if (ipxip_hold_input) { if (ipxip_lastin != NULL) { m_freem(ipxip_lastin); } ipxip_lastin = m_copym(m, 0, (int)M_COPYALL, M_DONTWAIT); } /* * Get IP and IPX header together in first mbuf. */ ipxipif.if_ipackets++; s = sizeof(struct ip) + sizeof(struct ipx); if (((m->m_flags & M_EXT) || m->m_len < s) && (m = m_pullup(m, s)) == NULL) { ipxipif.if_ierrors++; return; } ip = mtod(m, struct ip *); if (ip->ip_hl > (sizeof(struct ip) >> 2)) { ip_stripoptions(m, (struct mbuf *)NULL); if (m->m_len < s) { if ((m = m_pullup(m, s)) == NULL) { ipxipif.if_ierrors++; return; } ip = mtod(m, struct ip *); } } /* * Make mbuf data length reflect IPX length. * If not enough data to reflect IPX length, drop. */ m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); m->m_pkthdr.len -= sizeof(struct ip); ipx = mtod(m, struct ipx *); len = ntohs(ipx->ipx_len); if (len & 1) len++; /* Preserve Garbage Byte */ if (ip->ip_len != len) { if (len > ip->ip_len) { ipxipif.if_ierrors++; if (ipxip_badlen) m_freem(ipxip_badlen); ipxip_badlen = m; return; } /* Any extra will be trimmed off by the IPX routines */ } /* * Deliver to IPX */ if (IF_HANDOFF(ifq, m, NULL)) schednetisr(NETISR_IPX); return; } static int ipxipoutput(ifp, m, dst, rt) struct ifnet *ifp; struct mbuf *m; struct sockaddr *dst; struct rtentry *rt; { register struct ifnet_en *ifn = (struct ifnet_en *)ifp; register struct ip *ip; register struct route *ro = &(ifn->ifen_route); register int len = 0; register struct ipx *ipx = mtod(m, struct ipx *); int error; ifn->ifen_ifnet.if_opackets++; ipxipif.if_opackets++; /* * Calculate data length and make space * for IP header. */ len = ntohs(ipx->ipx_len); if (len & 1) len++; /* Preserve Garbage Byte */ /* following clause not necessary on vax */ if (3 & (int)m->m_data) { /* force longword alignment of ip hdr */ struct mbuf *m0 = m_gethdr(MT_HEADER, M_DONTWAIT); if (m0 == NULL) { m_freem(m); return (ENOBUFS); } MH_ALIGN(m0, sizeof(struct ip)); m0->m_flags = m->m_flags & M_COPYFLAGS; m0->m_next = m; m0->m_len = sizeof(struct ip); m0->m_pkthdr.len = m0->m_len + m->m_len; m->m_flags &= ~M_PKTHDR; m = m0; } else { M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) return (ENOBUFS); } /* * Fill in IP header. */ ip = mtod(m, struct ip *); *(long *)ip = 0; ip->ip_p = IPPROTO_IDP; ip->ip_src = ifn->ifen_src; ip->ip_dst = ifn->ifen_dst; ip->ip_len = (u_short)len + sizeof(struct ip); ip->ip_ttl = MAXTTL; /* * Output final datagram. 
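 *
 * (For illustration only: the datagram handed to ip_output() below is
 * laid out as
 *
 *	[ struct ip, ip_p == IPPROTO_IDP ][ IPX packet, padded to even length ]
 *
 * and ipxip_input() above recovers the IPX packet by stripping
 * sizeof(struct ip) back off the front.)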
*/ - error = (ip_output(m, (struct mbuf *)NULL, ro, SO_BROADCAST, NULL)); + error = (ip_output(m, (struct mbuf *)NULL, ro, SO_BROADCAST, NULL, NULL)); if (error) { ifn->ifen_ifnet.if_oerrors++; ifn->ifen_ifnet.if_ierrors = error; } return (error); m_freem(m); return (ENETUNREACH); } static void ipxipstart(ifp) struct ifnet *ifp; { panic("ipxip_start called\n"); } static struct ifreq ifr_ipxip = {"ipxip0"}; int ipxip_route(so, sopt) struct socket *so; struct sockopt *sopt; { int error; struct ifnet_en *ifn; struct sockaddr_in *src; struct ipxip_req rq; struct sockaddr_ipx *ipx_dst; struct sockaddr_in *ip_dst; struct route ro; error = sooptcopyin(sopt, &rq, sizeof rq, sizeof rq); if (error) return (error); ipx_dst = (struct sockaddr_ipx *)&rq.rq_ipx; ip_dst = (struct sockaddr_in *)&rq.rq_ip; /* * First, make sure we already have an IPX address: */ if (ipx_ifaddr == NULL) return (EADDRNOTAVAIL); /* * Now, determine if we can get to the destination */ bzero((caddr_t)&ro, sizeof(ro)); ro.ro_dst = *(struct sockaddr *)ip_dst; rtalloc(&ro); if (ro.ro_rt == NULL || ro.ro_rt->rt_ifp == NULL) { return (ENETUNREACH); } /* * And see how he's going to get back to us: * i.e., what return ip address do we use? */ { register struct in_ifaddr *ia; struct ifnet *ifp = ro.ro_rt->rt_ifp; for (ia = TAILQ_FIRST(&in_ifaddrhead); ia != NULL; ia = TAILQ_NEXT(ia, ia_link)) if (ia->ia_ifp == ifp) break; if (ia == NULL) ia = TAILQ_FIRST(&in_ifaddrhead); if (ia == NULL) { RTFREE(ro.ro_rt); return (EADDRNOTAVAIL); } src = (struct sockaddr_in *)&ia->ia_addr; } /* * Is there a free (pseudo-)interface or space? */ for (ifn = ipxip_list; ifn != NULL; ifn = ifn->ifen_next) { if ((ifn->ifen_ifnet.if_flags & IFF_UP) == 0) break; } if (ifn == NULL) ifn = ipxipattach(); if (ifn == NULL) { RTFREE(ro.ro_rt); return (ENOBUFS); } ifn->ifen_route = ro; ifn->ifen_dst = ip_dst->sin_addr; ifn->ifen_src = src->sin_addr; /* * now configure this as a point to point link */ ifr_ipxip.ifr_name[4] = '0' + ipxipif.if_unit - 1; ifr_ipxip.ifr_dstaddr = *(struct sockaddr *)ipx_dst; ipx_control(so, (int)SIOCSIFDSTADDR, (caddr_t)&ifr_ipxip, (struct ifnet *)ifn, sopt->sopt_td); /* use any of our addresses */ satoipx_addr(ifr_ipxip.ifr_addr).x_host = ipx_ifaddr->ia_addr.sipx_addr.x_host; return (ipx_control(so, (int)SIOCSIFADDR, (caddr_t)&ifr_ipxip, (struct ifnet *)ifn, sopt->sopt_td)); } static int ipxip_free(ifp) struct ifnet *ifp; { register struct ifnet_en *ifn = (struct ifnet_en *)ifp; struct route *ro = & ifn->ifen_route; if (ro->ro_rt != NULL) { RTFREE(ro->ro_rt); ro->ro_rt = NULL; } ifp->if_flags &= ~IFF_UP; return (0); } void ipxip_ctlinput(cmd, sa, dummy) int cmd; struct sockaddr *sa; void *dummy; { struct sockaddr_in *sin; if ((unsigned)cmd >= PRC_NCMDS) return; if (sa->sa_family != AF_INET && sa->sa_family != AF_IMPLINK) return; sin = (struct sockaddr_in *)sa; if (sin->sin_addr.s_addr == INADDR_ANY) return; switch (cmd) { case PRC_ROUTEDEAD: case PRC_REDIRECT_NET: case PRC_REDIRECT_HOST: case PRC_REDIRECT_TOSNET: case PRC_REDIRECT_TOSHOST: ipxip_rtchange(&sin->sin_addr); break; } } static void ipxip_rtchange(dst) register struct in_addr *dst; { register struct ifnet_en *ifn; for (ifn = ipxip_list; ifn != NULL; ifn = ifn->ifen_next) { if (ifn->ifen_dst.s_addr == dst->s_addr && ifn->ifen_route.ro_rt != NULL) { RTFREE(ifn->ifen_route.ro_rt); ifn->ifen_route.ro_rt = NULL; } } } #endif /* IPXIP */ Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 105193) +++ 
head/sys/sys/mbuf.h (revision 105194) @@ -1,499 +1,554 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ #include +#include /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size * MCLBYTES (also in machine/param.h), which has no additional overhead * and is used instead of the internal data area; this is done when * at least MINCLSIZE of data must be stored. Additionally, it is possible * to allocate a separate buffer externally and attach it to the mbuf in * a way similar to that of mbuf clusters. */ #define MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ #define MHLEN (MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ #define MINCLSIZE (MHLEN + 1) /* smallest amount to put in cluster */ #define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ #ifdef _KERNEL /*- * Macros for type conversion: * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * dtom(x) -- Convert data pointer within mbuf to mbuf pointer (XXX). */ #define mtod(m, t) ((t)((m)->m_data)) #define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1))) #endif /* _KERNEL */ /* * Header present at the beginning of every mbuf. */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ caddr_t mh_data; /* location of data */ int mh_len; /* amount of data in this mbuf */ int mh_flags; /* flags; see below */ short mh_type; /* type of data in this mbuf */ }; /* + * Packet tag structure (see below for details). 
+ */ +struct m_tag { + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ + u_int16_t m_tag_id; /* Tag ID */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_cookie; /* ABI/Module ID */ +}; + +/* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ int len; /* total packet length */ /* variables for ip and tcp reassembly */ void *header; /* pointer to packet header */ /* variables for hardware checksum */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ - struct mbuf *aux; /* extra data buffer; ipsec/others */ + SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ struct label label; /* MAC label of data in packet */ }; /* * Description of external storage mapped into mbuf; valid only if M_EXT is set. */ struct m_ext { caddr_t ext_buf; /* start of buffer */ void (*ext_free) /* free routine if not the usual */ (void *, void *); void *ext_args; /* optional argument pointer */ u_int ext_size; /* size of buffer, for ext_free */ u_int *ref_cnt; /* pointer to ref count info */ int ext_type; /* type of external storage */ }; /* * The core of the mbuf object along with some shortcut defines for * practical purposes. */ struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; #define m_next m_hdr.mh_next #define m_len m_hdr.mh_len #define m_data m_hdr.mh_data #define m_type m_hdr.mh_type #define m_flags m_hdr.mh_flags #define m_nextpkt m_hdr.mh_nextpkt #define m_act m_nextpkt #define m_pkthdr M_dat.MH.MH_pkthdr #define m_ext M_dat.MH.MH_dat.MH_ext #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf /* * mbuf flags. */ #define M_EXT 0x0001 /* has associated external storage */ #define M_PKTHDR 0x0002 /* start of record */ #define M_EOR 0x0004 /* end of record */ #define M_RDONLY 0x0008 /* associated data is marked read-only */ #define M_PROTO1 0x0010 /* protocol-specific */ #define M_PROTO2 0x0020 /* protocol-specific */ #define M_PROTO3 0x0040 /* protocol-specific */ #define M_PROTO4 0x0080 /* protocol-specific */ #define M_PROTO5 0x0100 /* protocol-specific */ /* * mbuf pkthdr flags (also stored in m_flags). */ #define M_BCAST 0x0200 /* send/received as link-level broadcast */ #define M_MCAST 0x0400 /* send/received as link-level multicast */ #define M_FRAG 0x0800 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x1000 /* packet is first fragment */ #define M_LASTFRAG 0x2000 /* packet is last fragment */ /* * External buffer types: identify ext_buf type. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */ #define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 200 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */ /* * Flags copied when copying m_pkthdr. */ #define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG|M_RDONLY) /* * Flags indicating hw checksum support and sw checksum requirements. 
*/ #define CSUM_IP 0x0001 /* will csum IP */ #define CSUM_TCP 0x0002 /* will csum TCP */ #define CSUM_UDP 0x0004 /* will csum UDP */ #define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ #define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ #define CSUM_IP_CHECKED 0x0100 /* did csum IP */ #define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ #define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ #define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ #define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) #define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? */ /* * mbuf types. */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER 2 /* packet header */ #if 0 #define MT_SOCKET 3 /* socket structure */ #define MT_PCB 4 /* protocol control block */ #define MT_RTABLE 5 /* routing tables */ #define MT_HTABLE 6 /* IMP host tables */ #define MT_ATABLE 7 /* address resolution tables */ #endif #define MT_SONAME 8 /* socket name */ #if 0 #define MT_SOOPTS 10 /* socket options */ #endif #define MT_FTABLE 11 /* fragment reassembly header */ #if 0 #define MT_RIGHTS 12 /* access rights */ #define MT_IFADDR 13 /* interface address */ #endif #define MT_TAG 13 /* volatile metadata associated to pkts */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ /* * Mbuf and cluster allocation statistics PCPU structure. */ struct mbpstat { u_long mb_mbfree; u_long mb_mbpgs; u_long mb_clfree; u_long mb_clpgs; long mb_mbtypes[MT_NTYPES]; short mb_active; }; /* * General mbuf allocator statistics structure. * XXX: Modifications of these are not protected by any mutex locks nor by * any atomic() manipulations. As a result, we may occasionally lose * a count or two. Luckily, not all of these fields are modified at all * and remain static, and those that are manipulated are only manipulated * in failure situations, which do not occur (hopefully) very often. */ struct mbstat { u_long m_drops; /* times failed to allocate */ u_long m_wait; /* times successfully returned from wait */ u_long m_drain; /* times drained protocols for space */ u_long m_mcfail; /* XXX: times m_copym failed */ u_long m_mpfail; /* XXX: times m_pullup failed */ u_long m_msize; /* length of an mbuf */ u_long m_mclbytes; /* length of an mbuf cluster */ u_long m_minclsize; /* min length of data to allocate a cluster */ u_long m_mlen; /* length of data in an mbuf */ u_long m_mhlen; /* length of data in a header mbuf */ /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array): */ short m_numtypes; }; /* * Flags specifying how an allocation should be made. * M_DONTWAIT means "don't block if nothing is available" whereas * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is * available." */ #define M_DONTWAIT 1 #define M_TRYWAIT 0 #define M_WAIT M_TRYWAIT /* XXX: Deprecated. */ #ifdef _KERNEL /*- * mbuf external reference count management macros. * * MEXT_IS_REF(m): true if (m) is not the only mbuf referencing * the external buffer ext_buf. * * MEXT_REM_REF(m): remove reference to m_ext object. * * MEXT_ADD_REF(m): add reference to m_ext object already * referred to by (m). 
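 *
 * (For illustration only: these counts are what make cheap copies safe.
 * After, say,
 *
 *	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *
 * both chains reference the same cluster, MEXT_IS_REF() is true for
 * each of them, and M_WRITABLE() below evaluates false until one of
 * the references is dropped.)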
*/ #define MEXT_IS_REF(m) (*((m)->m_ext.ref_cnt) > 1) #define MEXT_REM_REF(m) do { \ KASSERT(*((m)->m_ext.ref_cnt) > 0, ("m_ext refcnt < 0")); \ atomic_subtract_int((m)->m_ext.ref_cnt, 1); \ } while(0) #define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1) /* * mbuf, cluster, and external object allocation macros * (for compatibility purposes). */ #define M_COPY_PKTHDR(to, from) m_copy_pkthdr((to), (from)) #define m_getclr(how, type) m_get_clrd((how), (type)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, args, flags, type) \ m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type)) /* * MEXTFREE(m): disassociate (and possibly free) an external object from (m). * * If the atomic_cmpset_int() returns 0, then we effectively do nothing * in terms of "cleaning up" (freeing the ext buf and ref. counter) as * this means that either there are still references, or another thread * is taking care of the clean-up. */ #define MEXTFREE(m) do { \ struct mbuf *_mb = (m); \ \ MEXT_REM_REF(_mb); \ if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \ _mext_free(_mb); \ _mb->m_flags &= ~M_EXT; \ } while (0) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this * can be both the local data payload, or an external buffer area, * depending on whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && (!((m)->m_flags \ & M_EXT) || !MEXT_IS_REF(m))) /* * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. */ #define M_ALIGN(m, len) do { \ (m)->m_data += (MLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. */ #define MH_ALIGN(m, len) do { \ (m)->m_data += (MHLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * Compute the amount of space available * before the current start of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_LEADINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_data - (m)->m_ext.ext_buf : 0): \ (m)->m_flags & M_PKTHDR ? (m)->m_data - (m)->m_pktdat : \ (m)->m_data - (m)->m_dat) /* * Compute the amount of space available * after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_TRAILINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size \ - ((m)->m_data + (m)->m_len) : 0) : \ &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len)) /* * Arrange to prepend space of size plen to mbuf m. * If a new mbuf must be allocated, how specifies whether to wait. * If the allocation fails, the original mbuf chain is freed and m is * set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. 
* This is a relatively expensive operation and should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 /* Compatibility with 4.3. */ #define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) -/* - * pkthdr.aux type tags. - */ -struct mauxtag { - int af; - int type; - void *p; -}; - -/*- - * Some packet tags to identify different mbuf annotations. - * - * Eventually, these annotations will end up in an appropriate chain - * (struct m_tag or similar, e.g. as in NetBSD) properly managed by - * the mbuf handling routines. - * - * As a temporary and low impact solution to replace the even uglier - * approach used so far in some parts of the network stack (which relies - * on global variables), these annotations are stored in MT_TAG - * mbufs (or lookalikes) prepended to the actual mbuf chain. - * - * m_type = MT_TAG - * m_flags = m_tag_id - * m_next = next buffer in chain. - * - * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. - */ - -#define m_tag_id m_hdr.mh_flags - -/* Packet tag types -- first ones are from NetBSD */ - -#define PACKET_TAG_NONE 0 /* Nadda */ -#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ -#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ -#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ -#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ -#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ -#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ -#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ -#define PACKET_TAG_GIF 8 /* GIF processing done */ -#define PACKET_TAG_GRE 9 /* GRE processing done */ -#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ -#define PACKET_TAG_ENCAP 11 /* Encap. 
processing */ -#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ -#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ -#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ - -/* Packet tags used in the FreeBSD network stack */ -#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ -#define PACKET_TAG_IPFW 16 /* ipfw classification */ -#define PACKET_TAG_DIVERT 17 /* divert info */ -#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ - -#define PACKET_TAG_MAX 19 - extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern struct mbpstat mb_statpcpu[]; /* Per-CPU allocation stats */ extern struct mbstat mbstat; /* General mbuf stats/infos */ extern int nmbclusters; /* Maximum number of clusters */ extern int nmbcnt; /* Scale kmem_map for counter space */ extern int nmbufs; /* Maximum number of mbufs */ extern int nsfbufs; /* Number of sendfile(2) bufs */ void _mext_free(struct mbuf *); void m_adj(struct mbuf *, int); -struct mbuf *m_aux_add(struct mbuf *, int, int); -struct mbuf *m_aux_add2(struct mbuf *, int, int, void *); -void m_aux_delete(struct mbuf *, struct mbuf *); -struct mbuf *m_aux_find(struct mbuf *, int, int); -struct mbuf *m_aux_find2(struct mbuf *, int, int, void *); void m_cat(struct mbuf *, struct mbuf *); void m_chtype(struct mbuf *, short); void m_clget(struct mbuf *, int); void m_extadd(struct mbuf *, caddr_t, u_int, void (*)(void *, void *), void *, int, int); void m_copyback(struct mbuf *, int, int, caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_free(struct mbuf *); void m_freem(struct mbuf *); struct mbuf *m_get(int, short); struct mbuf *m_get_clrd(int, short); struct mbuf *m_getcl(int, short, int); struct mbuf *m_gethdr(int, short); struct mbuf *m_gethdr_clrd(int, short); struct mbuf *m_getm(struct mbuf *, int, int, short); u_int m_length(struct mbuf *, struct mbuf **); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); + +/* + * Packets may have annotations attached by affixing a list + * of "packet tags" to the pkthdr structure. Packet tags are + * dynamically allocated semi-opaque data structures that have + * a fixed header (struct m_tag) that specifies the size of the + * memory block and a (cookie, type) pair that identifies it. + * The cookie is a 32-bit unique unsigned value used to identify + * a module or ABI. By convention this value is chosen as the + * date+time that the module is created, expressed as the number of + * seconds since the epoch (e.g. using date -u +'%s'). The type value + * is an ABI/module-specific value that identifies a particular annotation + * and is private to the module. For compatibility with systems + * like openbsd that define packet tags w/o an ABI/module cookie, + * the value MTAG_ABI_COMPAT is used to implement m_tag_get and + * m_tag_find compatibility shim functions and several tag types are + * defined below. 
Users that do not require compatibility should use + a private cookie value so that packet tag-related definitions + can be maintained privately. + + Note that the packet tag returned by m_tag_alloc has the default + memory alignment implemented by malloc. To reference private data + one can use a construct like: + + struct m_tag *mtag = m_tag_alloc(...); + struct foo *p = (struct foo *)(mtag+1); + + if the alignment of struct m_tag is sufficient for referencing members + of struct foo. Otherwise it is necessary to embed struct m_tag within + the private data structure to ensure proper alignment; e.g. + + struct foo { + struct m_tag tag; + ... + }; + struct foo *p = (struct foo *) m_tag_alloc(...); + struct m_tag *mtag = &p->tag; + */ + +#define PACKET_TAG_NONE 0 /* Nadda */ + +/* Packet tag for use with MTAG_ABI_COMPAT */ +#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ +#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ +#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ +#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ +#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ +#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ +#define PACKET_TAG_GIF 8 /* GIF processing done */ +#define PACKET_TAG_GRE 9 /* GRE processing done */ +#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ +#define PACKET_TAG_ENCAP 11 /* Encap. processing */ +#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ +#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ +#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ + +/* + * As a temporary and low impact solution to replace the even uglier + * approach used so far in some parts of the network stack (which relies + * on global variables), packet tag-like annotations are stored in MT_TAG + * mbufs (or lookalikes) prepended to the actual mbuf chain. + * + * m_type = MT_TAG + * m_flags = m_tag_id + * m_next = next buffer in chain. + * + * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. 
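 *
 * (For illustration only, not part of this change: consumers of this
 * interim scheme peel the annotations off the front of the chain
 * before normal processing, along the lines of
 *
 *	while (m != NULL && m->m_type == MT_TAG) {
 *		examine the id in m_flags and the payload in m_data;
 *		m = m->m_next;
 *	}
 *
 * so that only the real packet ever reaches the mbuf routines.)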
+ */ +#define _m_tag_id m_hdr.mh_flags + +/* Packet tags used in the FreeBSD network stack */ +#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ +#define PACKET_TAG_IPFW 16 /* ipfw classification */ +#define PACKET_TAG_DIVERT 17 /* divert info */ +#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ + +/* Packet tag routines */ +struct m_tag *m_tag_alloc(u_int32_t, int, int, int); +void m_tag_free(struct m_tag *); +void m_tag_prepend(struct mbuf *, struct m_tag *); +void m_tag_unlink(struct mbuf *, struct m_tag *); +void m_tag_delete(struct mbuf *, struct m_tag *); +void m_tag_delete_chain(struct mbuf *, struct m_tag *); +struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); +struct m_tag *m_tag_copy(struct m_tag *); +int m_tag_copy_chain(struct mbuf *, struct mbuf *); +void m_tag_init(struct mbuf *); +struct m_tag *m_tag_first(struct mbuf *); +struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); + +/* these are for openbsd compatibility */ +#define MTAG_ABI_COMPAT 0 /* compatibility ABI */ + +static __inline struct m_tag * +m_tag_get(int type, int length, int wait) +{ + return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); +} + +static __inline struct m_tag * +m_tag_find(struct mbuf *m, int type, struct m_tag *start) +{ + return m_tag_locate(m, MTAG_ABI_COMPAT, type, start); +} #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */
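For illustration only, not part of this revision: a module-private packet tag built on the interface declared above might look like the sketch below, where MTAG_EXAMPLE, EXAMPLE_TYPE, struct example_tag and the two helper functions are invented names, and a malloc(9)-style wait flag is assumed for m_tag_alloc().

#define MTAG_EXAMPLE	1038441600	/* invented cookie: creation date, date -u +'%s' */
#define EXAMPLE_TYPE	1		/* invented module-private type */

struct example_tag {
	u_int32_t	seq;			/* module-private payload */
};

static int
example_tag_packet(struct mbuf *m, u_int32_t seq)
{
	struct m_tag *mtag;
	struct example_tag *et;

	/* header and payload are allocated together */
	mtag = m_tag_alloc(MTAG_EXAMPLE, EXAMPLE_TYPE,
	    sizeof(struct example_tag), M_NOWAIT);
	if (mtag == NULL)
		return (ENOBUFS);
	et = (struct example_tag *)(mtag + 1);	/* data follows the header */
	et->seq = seq;
	m_tag_prepend(m, mtag);			/* hook onto m->m_pkthdr.tags */
	return (0);
}

static struct example_tag *
example_tag_find(struct mbuf *m)
{
	struct m_tag *mtag;

	mtag = m_tag_locate(m, MTAG_EXAMPLE, EXAMPLE_TYPE, NULL);
	return (mtag != NULL ? (struct example_tag *)(mtag + 1) : NULL);
}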