Index: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c =================================================================== --- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 185087) +++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 185088) @@ -1,4473 +1,4468 @@ /************************************************************************** Copyright (c) 2007-2008, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 800044 #include #else #define V_tcp_do_autosndbuf tcp_do_autosndbuf #define V_tcp_autosndbuf_max tcp_autosndbuf_max #define V_tcp_do_rfc1323 tcp_do_rfc1323 #define V_tcp_do_autorcvbuf tcp_do_autorcvbuf #define V_tcp_autorcvbuf_max tcp_autorcvbuf_max #define V_tcpstat tcpstat #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * For ULP connections HW may add headers, e.g., for digests, that aren't part * of the messages sent by the host but that are part of the TCP payload and * therefore consume TCP sequence space. Tx connection parameters that * operate in TCP sequence space are affected by the HW additions and need to * compensate for them to accurately track TCP sequence numbers. This array * contains the compensating extra lengths for ULP packets. It is indexed by * a packet's ULP submode. */ const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; #ifdef notyet /* * This sk_buff holds a fake header-only TCP segment that we use whenever we * need to exploit SW TCP functionality that expects TCP headers, such as * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple * CPUs without locking. */ static struct mbuf *tcphdr_mbuf __read_mostly; #endif /* * Size of WRs in bytes. Note that we assume all devices we are handling have * the same WR size. 
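 *
 * Tx flow control is accounted in WR units: t3_push_frames() below charges
 * each outgoing message mbuf_wrs[] credits against tp_wr_avail (stashing the
 * count in m_pkthdr.csum_data) and the credits are reclaimed when the
 * hardware acknowledges the WRs.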
*/ static unsigned int wrlen __read_mostly; /* * The number of WRs needed for an skb depends on the number of page fragments * in the skb and whether it has any payload in its main body. This maps the * length of the gather list represented by an skb into the # of necessary WRs. */ static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; /* * Max receive window supported by HW in bytes. Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. */ #define MAX_RCV_WND ((1U << 27) - 1) /* * Min receive window. We want it to be large enough to accommodate receive * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. */ #define MIN_RCV_WND (24 * 1024U) #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) #define VALIDATE_SEQ 0 #define VALIDATE_SOCK(so) #define DEBUG_WR 0 #define TCP_TIMEWAIT 1 #define TCP_CLOSE 2 #define TCP_DROP 3 -extern int tcp_do_autorcvbuf; -extern int tcp_do_autosndbuf; -extern int tcp_autorcvbuf_max; -extern int tcp_autosndbuf_max; - static void t3_send_reset(struct toepcb *toep); static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); static inline void free_atid(struct t3cdev *cdev, unsigned int tid); static void handle_syncache_event(int event, void *arg); static inline void SBAPPEND(struct sockbuf *sb, struct mbuf *n) { struct mbuf *m; m = sb->sb_mb; while (m) { KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", m->m_next, m->m_nextpkt, m->m_flags)); m = m->m_next; } m = n; while (m) { KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", m->m_next, m->m_nextpkt, m->m_flags)); m = m->m_next; } KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); sbappendstream_locked(sb, n); m = sb->sb_mb; while (m) { KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", m->m_next, m->m_nextpkt, m->m_flags)); m = m->m_next; } } static inline int is_t3a(const struct toedev *dev) { return (dev->tod_ttid == TOE_ID_CHELSIO_T3); } static void dump_toepcb(struct toepcb *toep) { DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, toep->tp_mtu_idx, toep->tp_tid); DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, toep->tp_mss_clamp, toep->tp_flags); } #ifndef RTALLOC2_DEFINED static struct rtentry * rtalloc2(struct sockaddr *dst, int report, u_long ignflags) { struct rtentry *rt = NULL; if ((rt = rtalloc1(dst, report, ignflags)) != NULL) RT_UNLOCK(rt); return (rt); } #endif /* * Determine whether to send a CPL message now or defer it. A message is * deferred if the connection is in SYN_SENT since we don't know the TID yet. * For connections in other states the message is sent immediately. * If through_l2t is set the message is subject to ARP processing, otherwise * it is sent directly. 
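 *
 * Deferred messages accumulate on the toepcb's out_of_order_queue and are
 * presumably replayed once the connection leaves SYN_SENT and the real TID
 * is known (that replay path lives elsewhere in this file).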
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);	// send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);	// send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The skb must already be properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	INIT_VNET_INET(so->so_vnet);
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	    V_TX_URG(/* skb_urgent(skb) */ 0) |
	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");
		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	last = tail = snd->sb_sndptr ?
snd->sb_sndptr : snd->sb_mb; total_bytes = 0; DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { KASSERT(tail, ("sbdrop error")); last = tail = tail->m_next; } if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); sockbuf_unlock(snd); return (0); } toep->tp_m_last = NULL; while (toep->tp_wr_avail && (tail != NULL)) { count = bytes = 0; segp = segs; if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { sockbuf_unlock(snd); return (0); } /* * If the data in tail fits as in-line, then * make an immediate data wr. */ if (tail->m_len <= IMM_LEN) { count = 1; bytes = tail->m_len; last = tail; tail = tail->m_next; m_set_sgl(m0, NULL); m_set_sgllen(m0, 0); make_tx_data_wr(so, m0, bytes, tail); m_append(m0, bytes, mtod(last, caddr_t)); KASSERT(!m0->m_next, ("bad append")); } else { while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) && (tail != NULL) && (count < TX_MAX_SEGS-1)) { bytes += tail->m_len; last = tail; count++; /* * technically an abuse to be using this for a VA * but less gross than defining my own structure * or calling pmap_kextract from here :-| */ segp->ds_addr = (bus_addr_t)tail->m_data; segp->ds_len = tail->m_len; DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", count, mbuf_wrs[count], tail->m_data, tail->m_len); segp++; tail = tail->m_next; } DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", toep->tp_wr_avail, count, mbuf_wrs[count], tail); m_set_sgl(m0, segs); m_set_sgllen(m0, count); make_tx_data_wr(so, m0, bytes, tail); } m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); if (tail) { snd->sb_sndptr = tail; toep->tp_m_last = NULL; } else toep->tp_m_last = snd->sb_sndptr = last; DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); snd->sb_sndptroff += bytes; total_bytes += bytes; toep->tp_write_seq += bytes; CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" " tail=%p sndptr=%p sndptroff=%d", toep->tp_wr_avail, count, mbuf_wrs[count], tail, snd->sb_sndptr, snd->sb_sndptroff); if (tail) CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" " tp_m_last=%p tailbuf=%p snd_una=0x%08x", total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una); else CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" " tp_m_last=%p snd_una=0x%08x", total_bytes, toep->tp_m_last, tp->snd_una); #ifdef KTR { int i; i = 0; while (i < count && m_get_sgllen(m0)) { if ((count - i) >= 3) { CTR6(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" " len=%d pa=0x%zx len=%d", segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len, segs[i + 2].ds_addr, segs[i + 2].ds_len); i += 3; } else if ((count - i) == 2) { CTR4(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" " len=%d", segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len); i += 2; } else { CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", segs[i].ds_addr, segs[i].ds_len); i++; } } } #endif /* * remember credits used */ m0->m_pkthdr.csum_data = mbuf_wrs[count]; m0->m_pkthdr.len = bytes; toep->tp_wr_avail -= mbuf_wrs[count]; toep->tp_wr_unacked += mbuf_wrs[count]; if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || toep->tp_wr_unacked >= toep->tp_wr_max / 2) { struct work_request_hdr *wr = cplhdr(m0); wr->wr_hi |= htonl(F_WR_COMPL); toep->tp_wr_unacked = 0; } KASSERT((m0->m_pkthdr.csum_data > 0) && (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", m0->m_pkthdr.csum_data)); m0->m_type = MT_DONTFREE; enqueue_wr(toep, m0); 
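		/*
		 * The message is marked MT_DONTFREE and left on the toepcb's
		 * WR queue (enqueue_wr() above) so that it survives until the
		 * hardware acknowledges the work request; the credit count
		 * stashed in m_pkthdr.csum_data above is presumably returned
		 * to tp_wr_avail by the WR-ACK handling elsewhere in this
		 * file.
		 */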
DPRINTF("sending offload tx with %d bytes in %d segments\n", bytes, count); l2t_send(cdev, m0, toep->tp_l2t); } sockbuf_unlock(snd); return (total_bytes); } /* * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail * under any circumstances. We take the easy way out and always queue the * message to the write_queue. We can optimize the case where the queue is * already empty though the optimization is probably not worth it. */ static void close_conn(struct socket *so) { struct mbuf *m; struct cpl_close_con_req *req; struct tom_data *d; struct inpcb *inp = so_sotoinpcb(so); struct tcpcb *tp; struct toepcb *toep; unsigned int tid; inp_wlock(inp); tp = so_sototcpcb(so); toep = tp->t_toe; if (tp->t_state != TCPS_SYN_SENT) t3_push_frames(so, 1); if (toep->tp_flags & TP_FIN_SENT) { inp_wunlock(inp); return; } tid = toep->tp_tid; d = TOM_DATA(toep->tp_toedev); m = m_gethdr_nofail(sizeof(*req)); m_set_priority(m, CPL_PRIORITY_DATA); m_set_sgl(m, NULL); m_set_sgllen(m, 0); toep->tp_flags |= TP_FIN_SENT; req = mtod(m, struct cpl_close_con_req *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); req->wr.wr_lo = htonl(V_WR_TID(tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); req->rsvd = 0; inp_wunlock(inp); /* * XXX - need to defer shutdown while there is still data in the queue * */ CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); cxgb_ofld_send(d->cdev, m); } /* * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant * and send it along. */ static void abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) { struct cpl_abort_req *req = cplhdr(m); req->cmd = CPL_ABORT_NO_RST; cxgb_ofld_send(cdev, m); } /* * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are * permitted to return without sending the message in case we cannot allocate * an sk_buff. Returns the number of credits sent. */ uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) { struct mbuf *m; struct cpl_rx_data_ack *req; struct toepcb *toep = tp->t_toe; struct toedev *tdev = toep->tp_toedev; m = m_gethdr_nofail(sizeof(*req)); DPRINTF("returning %u credits to HW\n", credits); req = mtod(m, struct cpl_rx_data_ack *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); return (credits); } /* * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. * This is only used in DDP mode, so we take the opportunity to also set the * DACK mode and flush any Rx credits. */ void t3_send_rx_modulate(struct toepcb *toep) { struct mbuf *m; struct cpl_rx_data_ack *req; m = m_gethdr_nofail(sizeof(*req)); req = mtod(m, struct cpl_rx_data_ack *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wr_lo = 0; m->m_pkthdr.len = m->m_len = sizeof(*req); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | V_RX_DACK_MODE(1) | V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); cxgb_ofld_send(TOEP_T3C_DEV(toep), m); toep->tp_rcv_wup = toep->tp_copied_seq; } /* * Handle receipt of an urgent pointer. 
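 *
 * Per BSD convention the received urgent pointer points one byte past the
 * urgent data.  The body below appears to be compiled out in this port
 * (URGENT_DATA_SUPPORTED is not defined in this file).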
*/ static void handle_urg_ptr(struct socket *so, uint32_t urg_seq) { #ifdef URGENT_DATA_SUPPORTED struct tcpcb *tp = so_sototcpcb(so); urg_seq--; /* initially points past the urgent data, per BSD */ if (tp->urg_data && !after(urg_seq, tp->urg_seq)) return; /* duplicate pointer */ sk_send_sigurg(sk); if (tp->urg_seq == tp->copied_seq && tp->urg_data && !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) tom_eat_skb(sk, skb, 0); } tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = urg_seq; #endif } /* * Returns true if a socket cannot accept new Rx data. */ static inline int so_no_receive(const struct socket *so) { return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); } /* * Process an urgent data notification. */ static void rx_urg_notify(struct toepcb *toep, struct mbuf *m) { struct cpl_rx_urg_notify *hdr = cplhdr(m); struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); VALIDATE_SOCK(so); if (!so_no_receive(so)) handle_urg_ptr(so, ntohl(hdr->seq)); m_freem(m); } /* * Handler for RX_URG_NOTIFY CPL messages. */ static int do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; rx_urg_notify(toep, m); return (0); } static __inline int is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) { return (toep->tp_ulp_mode || (toep->tp_ulp_mode == ULP_MODE_TCPDDP && dev->tod_ttid >= TOE_ID_CHELSIO_T3)); } /* * Set of states for which we should return RX credits. */ #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) /* * Called after some received data has been read. It returns RX credits * to the HW for the amount of data processed. */ void t3_cleanup_rbuf(struct tcpcb *tp, int copied) { struct toepcb *toep = tp->t_toe; struct socket *so; struct toedev *dev; int dack_mode, must_send, read; u32 thres, credits, dack = 0; struct sockbuf *rcv; so = inp_inpcbtosocket(tp->t_inpcb); rcv = so_sockbuf_rcv(so); if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || (tp->t_state == TCPS_FIN_WAIT_2))) { if (copied) { sockbuf_lock(rcv); toep->tp_copied_seq += copied; sockbuf_unlock(rcv); } return; } inp_lock_assert(tp->t_inpcb); sockbuf_lock(rcv); if (copied) toep->tp_copied_seq += copied; else { read = toep->tp_enqueued_bytes - rcv->sb_cc; toep->tp_copied_seq += read; } credits = toep->tp_copied_seq - toep->tp_rcv_wup; toep->tp_enqueued_bytes = rcv->sb_cc; sockbuf_unlock(rcv); if (credits > rcv->sb_mbmax) { log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", toep->tp_copied_seq, toep->tp_rcv_wup, credits); credits = rcv->sb_mbmax; } /* * XXX this won't accurately reflect credit return - we need * to look at the difference between the amount that has been * put in the recv sockbuf and what is there now */ if (__predict_false(!credits)) return; dev = toep->tp_toedev; thres = TOM_TUNABLE(dev, rx_credit_thres); if (__predict_false(thres == 0)) return; if (is_delack_mode_valid(dev, toep)) { dack_mode = TOM_TUNABLE(dev, delack); if (__predict_false(dack_mode != toep->tp_delack_mode)) { u32 r = tp->rcv_nxt - toep->tp_delack_seq; if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(dack_mode); } } else dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); /* * For coalescing to work effectively ensure the receive window has * at least 16KB left. 
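	 *
	 * Example: with a 64KB receive window, a return is forced (must_send)
	 * as soon as fewer than 16KB of credits remain unreturned, i.e. when
	 * credits + 16384 >= rcv_wnd; otherwise credits are batched until
	 * they reach the rx_credit_thres tunable.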
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack,
		    must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_disconnect\n");

	so = inp_inpcbtosocket(tp->t_inpcb);
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;

	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;

	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = inp_inpcbtosocket(tp->t_inpcb);
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
	inp_lock_assert(tp->t_inpcb);

	t3_cleanup_rbuf(tp, 0);

	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;

	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 */
	inp_lock_assert(tp->t_inpcb);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
}

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};

static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;

	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    toep->tp_tid, word, mask, val);

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	send_or_defer(toep, m, 0);
}

static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = toep->tp_tp;

	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
		printf("not setting field\n");
		return;
	}

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(toep, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
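 *
 * The value handed to the TCB comes from the inpcb; note that INP_TOS()
 * (defined near the top of this file) shifts the low two ECN bits out of
 * ip_tos first.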
*/ static void t3_set_tos(struct toepcb *toep) { int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), V_TCB_TOS(tos)); } /* * In DDP mode, TP fails to schedule a timer to push RX data to the host when * DDP is disabled (data is delivered to freelist). [Note that, the peer should * set the PSH bit in the last segment, which would trigger delivery.] * We work around the issue by setting a DDP buffer in a partial placed state, * which guarantees that TP will schedule a timer. */ #define TP_DDP_TIMER_WORKAROUND_MASK\ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) #define TP_DDP_TIMER_WORKAROUND_VAL\ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ 32)) static void t3_enable_ddp(struct toepcb *toep, int on) { if (on) { t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), V_TF_DDP_OFF(0)); } else t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_MASK, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_VAL); } void t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color) { t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), tag_color); } void t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, unsigned int len) { if (buf_idx == 0) t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET, V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); else t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); } static int t3_set_cong_control(struct socket *so, const char *name) { #ifdef CONGESTION_CONTROL_SUPPORTED int cong_algo; for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) if (!strcmp(name, t3_cong_ops[cong_algo].name)) break; if (cong_algo >= ARRAY_SIZE(t3_cong_ops)) return -EINVAL; #endif return 0; } int t3_get_tcb(struct toepcb *toep) { struct cpl_get_tcb *req; struct tcpcb *tp = toep->tp_tp; struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (ENOMEM); inp_lock_assert(tp->t_inpcb); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); req = mtod(m, struct cpl_get_tcb *); m->m_pkthdr.len = m->m_len = sizeof(*req); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); req->cpuno = htons(toep->tp_qset); req->rsvd = 0; if (tp->t_state == TCPS_SYN_SENT) mbufq_tail(&toep->out_of_order_queue, m); // defer else cxgb_ofld_send(TOEP_T3C_DEV(toep), m); return 0; } static inline void so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid) { toepcb_hold(toep); cxgb_insert_tid(d->cdev, d->client, toep, tid); } /** * find_best_mtu - find the entry in the MTU table closest to an MTU * @d: TOM state * @mtu: the target MTU * * Returns the index of the value in the MTU table that is closest to but * does not exceed the target MTU. 
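 *
 * Example (hypothetical table values): with an MTU table {576, 1500, 9000}
 * and a target MTU of 1500 this returns index 1.  select_mss() below then
 * derives the MSS as mtus[idx] - 40, where 40 bytes covers the fixed IPv4
 * and TCP headers.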
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;

#ifdef notyet
	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);

	return (idx);
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		so_sorwakeup_locked(so);
	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	INIT_VNET_INET(so->so_vnet);
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (V_tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	INIT_VNET_INET(so->so_vnet);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (V_tcp_do_autorcvbuf)
		wnd = V_tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accommodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
	    (uint32_t)d->rx_page_size * 23 : MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
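 *
 * Specifically: it links the toepcb and tcpcb, grants the connection its
 * full WR credit allotment (the max_wrs tunable), picks an MSS index from
 * the MTU table via select_mss(), sizes the receive window via
 * select_rcv_wnd(), and selects ULP_MODE_TCPDDP when DDP is enabled, not
 * disabled on the socket, and the window is at least MIN_DDP_RCV_WIN.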
*/ static inline void init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep) { struct tcpcb *tp = so_sototcpcb(so); struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev); struct sockbuf *snd, *rcv; #ifdef notyet SOCK_LOCK_ASSERT(so); #endif snd = so_sockbuf_snd(so); rcv = so_sockbuf_rcv(so); log(LOG_INFO, "initializing offload socket\n"); /* * We either need to fix push frames to work with sbcompress * or we need to add this */ snd->sb_flags |= SB_NOCOALESCE; rcv->sb_flags |= SB_NOCOALESCE; tp->t_toe = toep; toep->tp_tp = tp; toep->tp_toedev = dev; toep->tp_tid = tid; toep->tp_l2t = e; toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs); toep->tp_wr_unacked = 0; toep->tp_delack_mode = 0; toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu); /* * XXX broken * */ tp->rcv_wnd = select_rcv_wnd(dev, so); toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) && tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; toep->tp_qset_idx = 0; reset_wr_list(toep); DPRINTF("initialization done\n"); } /* * The next two functions calculate the option 0 value for a socket. */ static inline unsigned int calc_opt0h(struct socket *so, int mtu_idx) { struct tcpcb *tp = so_sototcpcb(so); int wscale = select_rcv_wscale(tp->rcv_wnd); return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx); } static inline unsigned int calc_opt0l(struct socket *so, int ulp_mode) { struct tcpcb *tp = so_sototcpcb(so); unsigned int val; val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val); return (val); } static inline unsigned int calc_opt2(const struct socket *so, struct toedev *dev) { int flv_valid; flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); return (V_FLAVORS_VALID(flv_valid) | V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0)); } #if DEBUG_WR > 1 static int count_pending_wrs(const struct toepcb *toep) { const struct mbuf *m; int n = 0; wr_queue_walk(toep, m) n += m->m_pkthdr.csum_data; return (n); } #endif #if 0 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) #endif static void mk_act_open_req(struct socket *so, struct mbuf *m, unsigned int atid, const struct l2t_entry *e) { struct cpl_act_open_req *req; struct inpcb *inp = so_sotoinpcb(so); struct tcpcb *tp = inp_inpcbtotcpcb(inp); struct toepcb *toep = tp->t_toe; struct toedev *tdev = toep->tp_toedev; m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep)); req = mtod(m, struct cpl_act_open_req *); m->m_pkthdr.len = m->m_len = sizeof(*req); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port); #if 0 req->local_port = inp->inp_lport; req->peer_port = inp->inp_fport; memcpy(&req->local_ip, &inp->inp_laddr, 4); memcpy(&req->peer_ip, &inp->inp_faddr, 4); #endif req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); req->params = 0; req->opt2 = htonl(calc_opt2(so, tdev)); } /* * Convert an ACT_OPEN_RPL status to an errno. 
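 *
 * CPL_ERR_TCAM_FULL maps to ENOMEM because the HW connection lookup table
 * (the TCAM) has no free entries; the remaining codes map onto the closest
 * standard TCP errnos.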
*/ static int act_open_rpl_status_to_errno(int status) { switch (status) { case CPL_ERR_CONN_RESET: return (ECONNREFUSED); case CPL_ERR_ARP_MISS: return (EHOSTUNREACH); case CPL_ERR_CONN_TIMEDOUT: return (ETIMEDOUT); case CPL_ERR_TCAM_FULL: return (ENOMEM); case CPL_ERR_CONN_EXIST: log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); return (EADDRINUSE); default: return (EIO); } } static void fail_act_open(struct toepcb *toep, int errno) { struct tcpcb *tp = toep->tp_tp; t3_release_offload_resources(toep); if (tp) { inp_wunlock(tp->t_inpcb); tcp_offload_drop(tp, errno); } #ifdef notyet TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); #endif } /* * Handle active open failures. */ static void active_open_failed(struct toepcb *toep, struct mbuf *m) { struct cpl_act_open_rpl *rpl = cplhdr(m); struct inpcb *inp; if (toep->tp_tp == NULL) goto done; inp = toep->tp_tp->t_inpcb; /* * Don't handle connection retry for now */ #ifdef notyet struct inet_connection_sock *icsk = inet_csk(sk); if (rpl->status == CPL_ERR_CONN_EXIST && icsk->icsk_retransmit_timer.function != act_open_retry_timer) { icsk->icsk_retransmit_timer.function = act_open_retry_timer; sk_reset_timer(so, &icsk->icsk_retransmit_timer, jiffies + HZ / 2); } else #endif { inp_wlock(inp); /* * drops the inpcb lock */ fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status)); } done: m_free(m); } /* * Return whether a failed active open has allocated a TID */ static inline int act_open_has_tid(int status) { return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && status != CPL_ERR_ARP_MISS; } /* * Process an ACT_OPEN_RPL CPL message. */ static int do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; struct cpl_act_open_rpl *rpl = cplhdr(m); if (cdev->type != T3A && act_open_has_tid(rpl->status)) cxgb_queue_tid_release(cdev, GET_TID(rpl)); active_open_failed(toep, m); return (0); } /* * Handle an ARP failure for an active open. XXX purge ofo queue * * XXX badly broken for crossed SYNs as the ATID is no longer valid. * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't * free the atid. Hmm. */ #ifdef notyet static void act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m) { struct toepcb *toep = m_get_toep(m); struct tcpcb *tp = toep->tp_tp; struct inpcb *inp = tp->t_inpcb; struct socket *so; inp_wlock(inp); if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) { /* * drops the inpcb lock */ fail_act_open(so, EHOSTUNREACH); printf("freeing %p\n", m); m_free(m); } else inp_wunlock(inp); } #endif /* * Send an active open request. 
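 *
 * The steps below: allocate a toepcb and an ATID, resolve an L2T entry for
 * the destination, initialize the offload socket state, build the
 * CPL_ACT_OPEN_REQ and hand it to the L2T layer for transmission.  For
 * ULP_MODE_TCPDDP connections DDP is explicitly switched off at this point
 * (t3_enable_ddp(toep, 0)).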
*/ int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct mbuf *m; struct l2t_entry *e; struct tom_data *d = TOM_DATA(tdev); struct inpcb *inp = so_sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep; /* allocated by init_offload_socket */ int atid; toep = toepcb_alloc(); if (toep == NULL) goto out_err; if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) goto out_err; e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); if (!e) goto free_tid; inp_lock_assert(inp); m = m_gethdr(MT_DATA, M_WAITOK); #if 0 m->m_toe.mt_toepcb = tp->t_toe; set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); #endif so_lock(so); init_offload_socket(so, tdev, atid, e, rt, toep); install_offload_ops(so); mk_act_open_req(so, m, atid, e); so_unlock(so); soisconnecting(so); toep = tp->t_toe; m_set_toep(m, tp->t_toe); toep->tp_state = TCPS_SYN_SENT; l2t_send(d->cdev, (struct mbuf *)m, e); if (toep->tp_ulp_mode) t3_enable_ddp(toep, 0); return (0); free_tid: printf("failing connect - free atid\n"); free_atid(d->cdev, atid); out_err: printf("return ENOMEM\n"); return (ENOMEM); } /* * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do * not send multiple ABORT_REQs for the same connection and also that we do * not try to send a message after the connection has closed. Returns 1 if * an ABORT_REQ wasn't generated after all, 0 otherwise. */ static void t3_send_reset(struct toepcb *toep) { struct cpl_abort_req *req; unsigned int tid = toep->tp_tid; int mode = CPL_ABORT_SEND_RST; struct tcpcb *tp = toep->tp_tp; struct toedev *tdev = toep->tp_toedev; struct socket *so = NULL; struct mbuf *m; struct sockbuf *snd; if (tp) { inp_lock_assert(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); } if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || tdev == NULL)) return; toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); snd = so_sockbuf_snd(so); /* Purge the send queue so we don't send anything after an abort. */ if (so) sbflush(snd); if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) mode |= CPL_ABORT_POST_CLOSE_REQ; m = m_gethdr_nofail(sizeof(*req)); m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); set_arp_failure_handler(m, abort_arp_failure); req = mtod(m, struct cpl_abort_req *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); req->wr.wr_lo = htonl(V_WR_TID(tid)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); req->rsvd0 = tp ? 
	    htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP)
		return (EINVAL);

	inp = so_sotoinpcb(so);
	inp_wlock(inp);
	inp_ip_tos_set(inp, optval);
#if 0
	inp->inp_ip_tos = optval;
#endif
	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
	inp_wunlock(inp);

	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET) {
			KASSERT(0, ("unimplemented"));
			return (EOPNOTSUPP);
		}

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = so_sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);

		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);
	}

	return (0);
}

int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{
	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only account for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
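	 * (A GET_TCB_RPL does not identify which request produced it, so
	 * get_tcb_count below is the only bookkeeping available for matching
	 * replies to the DDP code's outstanding requests.)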
	 */
	q->get_tcb_count--;

	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);
		m_freem(m);

		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
	{
		unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

		t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
		ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

		t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
		rcv_nxt = t >> S_TCB_RCV_NXT;
		rcv_nxt &= M_TCB_RCV_NXT;

		t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
		rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
		rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

		T3_TRACE2(TIDTB(sk),
		    "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		    ddp_flags, rcv_nxt - rx_hdr_offset);
		T3_TRACE4(TB(q),
		    "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		    tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
		T3_TRACE3(TB(q),
		    "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		    rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
		T3_TRACE2(TB(q),
		    "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		    q->buf_state[0].flags, q->buf_state[1].flags);

	}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
		    "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
*/ m_free(m); return; } m->m_ddp_gl = (unsigned char *)bsp->gl; m->m_flags |= M_DDP; m->m_seq = tp->rcv_nxt; tp->rcv_nxt += m->m_pkthdr.len; tp->t_rcvtime = ticks; CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", m->m_seq, q->cur_buf, m->m_pkthdr.len); if (m->m_pkthdr.len == 0) { q->user_ddp_pending = 0; m_free(m); } else SBAPPEND(rcv, m); state = so_state_get(so); if (__predict_true((state & SS_NOFDREF) == 0)) so_sorwakeup_locked(so); else sockbuf_unlock(rcv); } /* * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, * in that case they are similar to DDP completions. */ static int do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; /* OK if socket doesn't exist */ if (toep == NULL) { printf("null toep in do_get_tcb_rpl\n"); return (CPL_RET_BUF_DONE); } inp_wlock(toep->tp_tp->t_inpcb); tcb_rpl_as_ddp_complete(toep, m); inp_wunlock(toep->tp_tp->t_inpcb); return (0); } static void handle_ddp_data(struct toepcb *toep, struct mbuf *m) { struct tcpcb *tp = toep->tp_tp; struct socket *so; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data *hdr = cplhdr(m); unsigned int rcv_nxt = ntohl(hdr->seq); struct sockbuf *rcv; if (tp->rcv_nxt == rcv_nxt) return; inp_lock_assert(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); rcv = so_sockbuf_rcv(so); sockbuf_lock(rcv); q = &toep->tp_ddp_state; bsp = &q->buf_state[q->cur_buf]; KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", rcv_nxt, tp->rcv_nxt)); m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); #ifdef T3_TRACE if ((int)m->m_pkthdr.len < 0) { t3_ddp_error(so, "handle_ddp_data: neg len"); } #endif m->m_ddp_gl = (unsigned char *)bsp->gl; m->m_flags |= M_DDP; m->m_cur_offset = bsp->cur_offset; m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; if (bsp->flags & DDP_BF_NOCOPY) bsp->flags &= ~DDP_BF_NOCOPY; m->m_seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; bsp->cur_offset += m->m_pkthdr.len; if (!(bsp->flags & DDP_BF_NOFLIP)) q->cur_buf ^= 1; /* * For now, don't re-enable DDP after a connection fell out of DDP * mode. */ q->ubuf_ddp_ready = 0; sockbuf_unlock(rcv); } /* * Process new data received for a connection. 
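 *
 * The payload is appended to the receive sockbuf with SBAPPEND() and
 * rcv_nxt advances by the enqueued length; the corresponding RX credits
 * are returned to the hardware later, from t3_cleanup_rbuf(), once the
 * application has consumed the data.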
*/ static void new_rx_data(struct toepcb *toep, struct mbuf *m) { struct cpl_rx_data *hdr = cplhdr(m); struct tcpcb *tp = toep->tp_tp; struct socket *so; struct sockbuf *rcv; int state; int len = be16toh(hdr->len); inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); if (__predict_false(so_no_receive(so))) { handle_excess_rx(toep, m); inp_wunlock(tp->t_inpcb); TRACE_EXIT; return; } if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) handle_ddp_data(toep, m); m->m_seq = ntohl(hdr->seq); m->m_ulp_mode = 0; /* for iSCSI */ #if VALIDATE_SEQ if (__predict_false(m->m_seq != tp->rcv_nxt)) { log(LOG_ERR, "%s: TID %u: Bad sequence number %u, expected %u\n", toep->tp_toedev->name, toep->tp_tid, m->m_seq, tp->rcv_nxt); m_freem(m); inp_wunlock(tp->t_inpcb); return; } #endif m_adj(m, sizeof(*hdr)); #ifdef URGENT_DATA_SUPPORTED /* * We don't handle urgent data yet */ if (__predict_false(hdr->urg)) handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); if (__predict_false(tp->urg_data == TCP_URG_NOTYET && tp->urg_seq - tp->rcv_nxt < skb->len)) tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - tp->rcv_nxt]; #endif if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { toep->tp_delack_mode = hdr->dack_mode; toep->tp_delack_seq = tp->rcv_nxt; } CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); if (len < m->m_pkthdr.len) m->m_pkthdr.len = m->m_len = len; tp->rcv_nxt += m->m_pkthdr.len; tp->t_rcvtime = ticks; toep->tp_enqueued_bytes += m->m_pkthdr.len; CTR2(KTR_TOM, "new_rx_data: seq 0x%x len %u", m->m_seq, m->m_pkthdr.len); inp_wunlock(tp->t_inpcb); rcv = so_sockbuf_rcv(so); sockbuf_lock(rcv); #if 0 if (sb_notify(rcv)) DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); #endif SBAPPEND(rcv, m); #ifdef notyet /* * We're giving too many credits to the card - but disable this check so we can keep on moving :-| * */ KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", so, rcv->sb_cc, rcv->sb_mbmax)); #endif CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", rcv->sb_cc, rcv->sb_mbcnt); state = so_state_get(so); if (__predict_true((state & SS_NOFDREF) == 0)) so_sorwakeup_locked(so); else sockbuf_unlock(rcv); } /* * Handler for RX_DATA CPL messages. 
*/ static int do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); new_rx_data(toep, m); return (0); } static void new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) { struct tcpcb *tp; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data_ddp *hdr; struct socket *so; unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; int nomoredata = 0; unsigned int delack_mode; struct sockbuf *rcv; tp = toep->tp_tp; inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); if (__predict_false(so_no_receive(so))) { handle_excess_rx(toep, m); inp_wunlock(tp->t_inpcb); return; } q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->u.ddp_report); buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; bsp = &q->buf_state[buf_idx]; CTR4(KTR_TOM, "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " "hdr seq 0x%x len %u", tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), ntohs(hdr->len)); CTR3(KTR_TOM, "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); ddp_len = ntohs(hdr->len); rcv_nxt = ntohl(hdr->seq) + ddp_len; delack_mode = G_DDP_DACK_MODE(ddp_report); if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { toep->tp_delack_mode = delack_mode; toep->tp_delack_seq = tp->rcv_nxt; } m->m_seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; tp->t_rcvtime = ticks; /* * Store the length in m->m_len. We are changing the meaning of * m->m_len here, we need to be very careful that nothing from now on * interprets ->len of this packet the usual way. */ m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; inp_wunlock(tp->t_inpcb); CTR3(KTR_TOM, "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", m->m_len, rcv_nxt, m->m_seq); /* * Figure out where the new data was placed in the buffer and store it * in when. Assumes the buffer offset starts at 0, consumer needs to * account for page pod's pg_offset. */ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; m->m_cur_offset = end_offset - m->m_pkthdr.len; rcv = so_sockbuf_rcv(so); sockbuf_lock(rcv); m->m_ddp_gl = (unsigned char *)bsp->gl; m->m_flags |= M_DDP; bsp->cur_offset = end_offset; toep->tp_enqueued_bytes += m->m_pkthdr.len; /* * Length is only meaningful for kbuf */ if (!(bsp->flags & DDP_BF_NOCOPY)) KASSERT(m->m_len <= bsp->gl->dgl_length, ("length received exceeds ddp pages: len=%d dgl_length=%d", m->m_len, bsp->gl->dgl_length)); KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); /* * Bit 0 of flags stores whether the DDP buffer is completed. * Note that other parts of the code depend on this being in bit 0. 
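	 * (The receive-wakeup test below relies on this: it checks
	 * (m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1) to detect a
	 * completed user buffer.)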
*/ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { panic("spurious ddp completion"); } else { m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) q->cur_buf ^= 1; /* flip buffers */ } if (bsp->flags & DDP_BF_NOCOPY) { m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); bsp->flags &= ~DDP_BF_NOCOPY; } if (ddp_report & F_DDP_PSH) m->m_ddp_flags |= DDP_BF_PSH; if (nomoredata) m->m_ddp_flags |= DDP_BF_NODATA; #ifdef notyet skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ #endif SBAPPEND(rcv, m); if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) so_sorwakeup_locked(so); else sockbuf_unlock(rcv); } #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ F_DDP_INVALID_PPOD) /* * Handler for RX_DATA_DDP CPL messages. */ static int do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; const struct cpl_rx_data_ddp *hdr = cplhdr(m); VALIDATE_SOCK(so); if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); return (CPL_RET_BUF_DONE); } #if 0 skb->h.th = tcphdr_skb->h.th; #endif new_rx_data_ddp(toep, m); return (0); } static void process_ddp_complete(struct toepcb *toep, struct mbuf *m) { struct tcpcb *tp = toep->tp_tp; struct socket *so; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_ddp_complete *hdr; unsigned int ddp_report, buf_idx, when, delack_mode; int nomoredata = 0; struct sockbuf *rcv; inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); if (__predict_false(so_no_receive(so))) { struct inpcb *inp = so_sotoinpcb(so); handle_excess_rx(toep, m); inp_wunlock(inp); return; } q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->ddp_report); buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; m->m_pkthdr.csum_data = tp->rcv_nxt; rcv = so_sockbuf_rcv(so); sockbuf_lock(rcv); bsp = &q->buf_state[buf_idx]; when = bsp->cur_offset; m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; tp->rcv_nxt += m->m_len; tp->t_rcvtime = ticks; delack_mode = G_DDP_DACK_MODE(ddp_report); if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { toep->tp_delack_mode = delack_mode; toep->tp_delack_seq = tp->rcv_nxt; } #ifdef notyet skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ #endif inp_wunlock(tp->t_inpcb); KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); CTR5(KTR_TOM, "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " "ddp_report 0x%x offset %u, len %u", tp->rcv_nxt, bsp->cur_offset, ddp_report, G_DDP_OFFSET(ddp_report), m->m_len); m->m_cur_offset = bsp->cur_offset; bsp->cur_offset += m->m_len; if (!(bsp->flags & DDP_BF_NOFLIP)) { q->cur_buf ^= 1; /* flip buffers */ if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) nomoredata=1; } CTR4(KTR_TOM, "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " "ddp_report %u offset %u", tp->rcv_nxt, bsp->cur_offset, ddp_report, G_DDP_OFFSET(ddp_report)); m->m_ddp_gl = (unsigned char *)bsp->gl; m->m_flags |= M_DDP; m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; if (bsp->flags & DDP_BF_NOCOPY) bsp->flags &= 
~DDP_BF_NOCOPY; if (nomoredata) m->m_ddp_flags |= DDP_BF_NODATA; SBAPPEND(rcv, m); if ((so_state_get(so) & SS_NOFDREF) == 0) so_sorwakeup_locked(so); else sockbuf_unlock(rcv); } /* * Handler for RX_DDP_COMPLETE CPL messages. */ static int do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; VALIDATE_SOCK(so); #if 0 skb->h.th = tcphdr_skb->h.th; #endif process_ddp_complete(toep, m); return (0); } /* * Move a socket to TIME_WAIT state. We need to make some adjustments to the * socket state before calling tcp_time_wait to comply with its expectations. */ static void enter_timewait(struct tcpcb *tp) { /* * Bump rcv_nxt for the peer FIN. We don't do this at the time we * process peer_close because we don't want to carry the peer FIN in * the socket's receive queue and if we increment rcv_nxt without * having the FIN in the receive queue we'll confuse facilities such * as SIOCINQ. */ inp_wlock(tp->t_inpcb); tp->rcv_nxt++; tp->ts_recent_age = 0; /* defeat recycling */ tp->t_srtt = 0; /* defeat tcp_update_metrics */ inp_wunlock(tp->t_inpcb); tcp_offload_twstart(tp); } /* * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This * function deals with the data that may be reported along with the FIN. * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to * perform normal FIN-related processing. In the latter case 1 indicates that * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the * skb can be freed. */ static int handle_peer_close_data(struct socket *so, struct mbuf *m) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_peer_close *req = cplhdr(m); unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ struct sockbuf *rcv; if (tp->rcv_nxt == rcv_nxt) /* no data */ return (0); CTR0(KTR_TOM, "handle_peer_close_data"); if (__predict_false(so_no_receive(so))) { handle_excess_rx(toep, m); /* * Although we discard the data we want to process the FIN so * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + * PEER_CLOSE without data. In particular this PEER_CLOSE * may be what will close the connection. We return 1 because * handle_excess_rx() already freed the packet. */ return (1); } inp_lock_assert(tp->t_inpcb); q = &toep->tp_ddp_state; rcv = so_sockbuf_rcv(so); sockbuf_lock(rcv); bsp = &q->buf_state[q->cur_buf]; m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); m->m_ddp_gl = (unsigned char *)bsp->gl; m->m_flags |= M_DDP; m->m_cur_offset = bsp->cur_offset; m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; m->m_seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; bsp->cur_offset += m->m_pkthdr.len; if (!(bsp->flags & DDP_BF_NOFLIP)) q->cur_buf ^= 1; #ifdef notyet skb_reset_transport_header(skb); tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ #endif tp->t_rcvtime = ticks; SBAPPEND(rcv, m); if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) so_sorwakeup_locked(so); else sockbuf_unlock(rcv); return (1); } /* * Handle a peer FIN. 
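 *
 * Before the handler itself, a minimal illustrative sketch (not part of
 * this file): the state transitions that do_peer_fin() applies on receipt
 * of a peer FIN, written as a pure function over the standard TCPS_*
 * constants from <netinet/tcp_fsm.h>.  The FIN_WAIT_2 result is TIME_WAIT
 * only when no abort is pending; the handler below deals with that race.
 */
#if 0	/* illustrative sketch only */
static int
peer_fin_next_state(int t_state)
{
	switch (t_state) {
	case TCPS_SYN_RECEIVED:		/* treated like ESTABLISHED */
	case TCPS_ESTABLISHED:
		return (TCPS_CLOSE_WAIT);
	case TCPS_FIN_WAIT_1:
		return (TCPS_CLOSING);
	case TCPS_FIN_WAIT_2:
		return (TCPS_TIME_WAIT);	/* or CLOSE if an abort raced */
	default:
		return (-1);			/* unexpected state, log it */
	}
}
#endif
/*
 * do_peer_fin: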
*/ static void do_peer_fin(struct toepcb *toep, struct mbuf *m) { struct socket *so; struct tcpcb *tp = toep->tp_tp; int keep, action; action = keep = 0; CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { printf("abort_pending set\n"); goto out; } inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { keep = handle_peer_close_data(so, m); if (keep < 0) { inp_wunlock(tp->t_inpcb); return; } } if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { CTR1(KTR_TOM, "waking up waiters for cantrcvmore on %p ", so); socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; case TCPS_FIN_WAIT_2: /* * If we've sent an abort_req we must have sent it too late, * HW will send us a reply telling us so, and this peer_close * is really the last message for this connection and needs to * be treated as an abort_rpl, i.e., transition the connection * to TCP_CLOSE (note that the host stack does this at the * time of generating the RST but we must wait for HW). * Otherwise we enter TIME_WAIT. */ t3_release_offload_resources(toep); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { action = TCP_CLOSE; } else { action = TCP_TIMEWAIT; } break; default: log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); } inp_wunlock(tp->t_inpcb); if (action == TCP_TIMEWAIT) { enter_timewait(tp); } else if (action == TCP_DROP) { tcp_offload_drop(tp, 0); } else if (action == TCP_CLOSE) { tcp_offload_close(tp); } #ifdef notyet /* Do not send POLL_HUP for half duplex close. */ if ((sk->sk_shutdown & SEND_SHUTDOWN) || sk->sk_state == TCP_CLOSE) sk_wake_async(so, 1, POLL_HUP); else sk_wake_async(so, 1, POLL_IN); #endif out: if (!keep) m_free(m); } /* * Handler for PEER_CLOSE CPL messages. */ static int do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; VALIDATE_SOCK(so); do_peer_fin(toep, m); return (0); } static void process_close_con_rpl(struct toepcb *toep, struct mbuf *m) { struct cpl_close_con_rpl *rpl = cplhdr(m); struct tcpcb *tp = toep->tp_tp; struct socket *so; int action = 0; struct sockbuf *rcv; inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { inp_wunlock(tp->t_inpcb); goto out; } CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, tp->t_state, !!(so_state_get(so) & SS_NOFDREF)); switch (tp->t_state) { case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ t3_release_offload_resources(toep); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { action = TCP_CLOSE; } else { action = TCP_TIMEWAIT; } break; case TCPS_LAST_ACK: /* * In this state we don't care about pending abort_rpl. * If we've sent abort_req it was post-close and was sent too * late, this close_con_rpl is the actual last message. 
*/ t3_release_offload_resources(toep); action = TCP_CLOSE; break; case TCPS_FIN_WAIT_1: /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so) rcv = so_sockbuf_rcv(so); else break; if (rcv->sb_state & SBS_CANTRCVMORE) { int timeout; if (so) soisdisconnected(so); timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : tcp_maxidle; tcp_timer_activate(tp, TT_2MSL, timeout); } tp->t_state = TCPS_FIN_WAIT_2; if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { action = TCP_DROP; } break; default: log(LOG_ERR, "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); } inp_wunlock(tp->t_inpcb); if (action == TCP_TIMEWAIT) { enter_timewait(tp); } else if (action == TCP_DROP) { tcp_offload_drop(tp, 0); } else if (action == TCP_CLOSE) { tcp_offload_close(tp); } out: m_freem(m); } /* * Handler for CLOSE_CON_RPL CPL messages. */ static int do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; process_close_con_rpl(toep, m); return (0); } /* * Process abort replies. We only process these messages if we anticipate * them as the coordination between SW and HW in this area is somewhat lacking * and sometimes we get ABORT_RPLs after we are done with the connection that * originated the ABORT_REQ. */ static void process_abort_rpl(struct toepcb *toep, struct mbuf *m) { struct tcpcb *tp = toep->tp_tp; struct socket *so; int needclose = 0; #ifdef T3_TRACE T3_TRACE1(TIDTB(sk), "process_abort_rpl: GTS rpl pending %d", sock_flag(sk, ABORT_RPL_PENDING)); #endif inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { /* * XXX panic on tcpdrop */ if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) toep->tp_flags |= TP_ABORT_RPL_RCVD; else { toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || !is_t3a(toep->tp_toedev)) { if (toep->tp_flags & TP_ABORT_REQ_RCVD) panic("TP_ABORT_REQ_RCVD set"); t3_release_offload_resources(toep); needclose = 1; } } } inp_wunlock(tp->t_inpcb); if (needclose) tcp_offload_close(tp); m_free(m); } /* * Handle an ABORT_RPL_RSS CPL message. */ static int do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct cpl_abort_rpl_rss *rpl = cplhdr(m); struct toepcb *toep; /* * Ignore replies to post-close aborts indicating that the abort was * requested too late. These connections are terminated when we get * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss * arrives the TID is either no longer used or it has been recycled. */ if (rpl->status == CPL_ERR_ABORT_FAILED) { discard: m_free(m); return (0); } toep = (struct toepcb *)ctx; /* * Sometimes we've already closed the socket, e.g., a post-close * abort races with ABORT_REQ_RSS, the latter frees the socket * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, * but FW turns the ABORT_REQ into a regular one and so we get * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 
*/ if (!toep) goto discard; if (toep->tp_tp == NULL) { log(LOG_NOTICE, "removing tid for abort\n"); cxgb_remove_tid(cdev, toep, toep->tp_tid); if (toep->tp_l2t) l2t_release(L2DATA(cdev), toep->tp_l2t); toepcb_release(toep); goto discard; } log(LOG_NOTICE, "toep=%p\n", toep); log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); toepcb_hold(toep); process_abort_rpl(toep, m); toepcb_release(toep); return (0); } /* * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also * indicate whether RST should be sent in response. */ static int abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) { struct tcpcb *tp = so_sototcpcb(so); switch (abort_reason) { case CPL_ERR_BAD_SYN: #if 0 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through #endif case CPL_ERR_CONN_RESET: // XXX need to handle SYN_RECV due to crossed SYNs return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); case CPL_ERR_XMIT_TIMEDOUT: case CPL_ERR_PERSIST_TIMEDOUT: case CPL_ERR_FINWAIT2_TIMEDOUT: case CPL_ERR_KEEPALIVE_TIMEDOUT: #if 0 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); #endif return (ETIMEDOUT); default: return (EIO); } } static inline void set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) { struct cpl_abort_rpl *rpl = cplhdr(m); rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); rpl->wr.wr_lo = htonl(V_WR_TID(tid)); m->m_len = m->m_pkthdr.len = sizeof(*rpl); OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); rpl->cmd = cmd; } static void send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) { struct mbuf *reply_mbuf; struct cpl_abort_req_rss *req = cplhdr(m); reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); m_set_priority(m, CPL_PRIORITY_DATA); m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); m_free(m); } /* * Returns whether an ABORT_REQ_RSS message is a negative advice. */ static inline int is_neg_adv_abort(unsigned int status) { return status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE; } static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) { struct mbuf *reply_mbuf; struct cpl_abort_req_rss *req = cplhdr(m); reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); if (!reply_mbuf) { /* Defer the reply. Stick rst_status into req->cmd. */ req->status = rst_status; t3_defer_reply(m, tdev, send_deferred_abort_rpl); return; } m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); m_free(m); /* * XXX need to sync with ARP as for SYN_RECV connections we can send * these messages while ARP is pending. For other connection states * it's not a problem. */ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); } #ifdef notyet static void cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) { CXGB_UNIMPLEMENTED(); #ifdef notyet struct request_sock *req = child->sk_user_data; inet_csk_reqsk_queue_removed(parent, req); synq_remove(tcp_sk(child)); __reqsk_free(req); child->sk_user_data = NULL; #endif } /* * Performs the actual work to abort a SYN_RECV connection. */ static void do_abort_syn_rcv(struct socket *child, struct socket *parent) { struct tcpcb *parenttp = so_sototcpcb(parent); struct tcpcb *childtp = so_sototcpcb(child); /* * If the server is still open we clean up the child connection, * otherwise the server already did the clean up as it was purging * its SYN queue and the skb was just sitting in its backlog. 
*/ if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { cleanup_syn_rcv_conn(child, parent); inp_wlock(childtp->t_inpcb); t3_release_offload_resources(childtp->t_toe); inp_wunlock(childtp->t_inpcb); tcp_offload_close(childtp); } } #endif /* * Handle abort requests for a SYN_RECV connection. These need extra work * because the socket is on its parent's SYN queue. */ static int abort_syn_rcv(struct socket *so, struct mbuf *m) { CXGB_UNIMPLEMENTED(); #ifdef notyet struct socket *parent; struct toedev *tdev = toep->tp_toedev; struct t3cdev *cdev = TOM_DATA(tdev)->cdev; struct socket *oreq = so->so_incomp; struct t3c_tid_entry *t3c_stid; struct tid_info *t; if (!oreq) return -1; /* somehow we are not on the SYN queue */ t = &(T3C_DATA(cdev))->tid_maps; t3c_stid = lookup_stid(t, oreq->ts_recent); parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; so_lock(parent); do_abort_syn_rcv(so, parent); send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); so_unlock(parent); #endif return (0); } /* * Process abort requests. If we are waiting for an ABORT_RPL we ignore this * request except that we need to reply to it. */ static void process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) { int rst_status = CPL_ABORT_NO_RST; const struct cpl_abort_req_rss *req = cplhdr(m); struct tcpcb *tp = toep->tp_tp; struct socket *so; int needclose = 0; inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); m_free(m); goto skip; } toep->tp_flags &= ~TP_ABORT_REQ_RCVD; /* * Three cases to consider: * a) We haven't sent an abort_req; close the connection. * b) We have sent a post-close abort_req that will get to TP too late * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will * be ignored and the connection should be closed now. * c) We have sent a regular abort_req that will get to TP too late. * That will generate an abort_rpl with status 0, wait for it. */ if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { int error; error = abort_status_to_errno(so, req->status, &rst_status); so_error_set(so, error); if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) so_sorwakeup(so); /* * SYN_RECV needs special processing. If abort_syn_rcv() * returns 0 it has taken care of the abort. */ if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) goto skip; t3_release_offload_resources(toep); needclose = 1; } inp_wunlock(tp->t_inpcb); if (needclose) tcp_offload_close(tp); send_abort_rpl(m, tdev, rst_status); return; skip: inp_wunlock(tp->t_inpcb); } /* * Handle an ABORT_REQ_RSS CPL message.
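 *
 * As a hedged sketch (not part of this file), the decision that
 * process_abort_req() above makes can be isolated: close the connection
 * immediately unless a regular abort_rpl is still expected.  It uses the
 * TP_* flags defined by this TOM; is_t3a_dev stands for is_t3a()'s result.
 */
#if 0	/* illustrative sketch only */
static int
abort_closes_connection_now(unsigned int tp_flags, int is_t3a_dev)
{
	/* Case a: we never sent an abort_req of our own. */
	if ((tp_flags & TP_ABORT_RPL_PENDING) == 0)
		return (1);
	/* Case b: post-close abort on T3A; its failed RPL will be ignored. */
	if (is_t3a_dev && (tp_flags & TP_CLOSE_CON_REQUESTED))
		return (1);
	/* Case c: a regular abort_rpl is still in flight; wait for it. */
	return (0);
}
#endif
/*
 * do_abort_req: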
*/ static int do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) { const struct cpl_abort_req_rss *req = cplhdr(m); struct toepcb *toep = (struct toepcb *)ctx; if (is_neg_adv_abort(req->status)) { m_free(m); return (0); } log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { cxgb_remove_tid(cdev, toep, toep->tp_tid); toep->tp_flags |= TP_ABORT_REQ_RCVD; send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); if (toep->tp_l2t) l2t_release(L2DATA(cdev), toep->tp_l2t); /* * Unhook */ toep->tp_tp->t_toe = NULL; toep->tp_tp->t_flags &= ~TF_TOE; toep->tp_tp = NULL; /* * XXX need to call syncache_chkrst - but we don't * have a way of doing that yet */ toepcb_release(toep); log(LOG_ERR, "abort for unestablished connection :-(\n"); return (0); } if (toep->tp_tp == NULL) { log(LOG_NOTICE, "disconnected toepcb\n"); /* should be freed momentarily */ return (0); } toepcb_hold(toep); process_abort_req(toep, m, toep->tp_toedev); toepcb_release(toep); return (0); } #ifdef notyet static void pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) { struct toedev *tdev = TOE_DEV(parent); do_abort_syn_rcv(child, parent); if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { struct cpl_pass_accept_rpl *rpl = cplhdr(m); rpl->opt0h = htonl(F_TCAM_BYPASS); rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); } else m_free(m); } #endif static void handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) { CXGB_UNIMPLEMENTED(); #ifdef notyet struct t3cdev *cdev; struct socket *parent; struct socket *oreq; struct t3c_tid_entry *t3c_stid; struct tid_info *t; struct tcpcb *otp, *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; /* * If the connection is being aborted due to the parent listening * socket going away there's nothing to do, the ABORT_REQ will close * the connection. */ if (toep->tp_flags & TP_ABORT_RPL_PENDING) { m_free(m); return; } oreq = so->so_incomp; otp = so_sototcpcb(oreq); cdev = T3C_DEV(so); t = &(T3C_DATA(cdev))->tid_maps; t3c_stid = lookup_stid(t, otp->ts_recent); parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; so_lock(parent); pass_open_abort(so, parent, m); so_unlock(parent); #endif } /* * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV * connection. */ static void pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) { #ifdef notyet TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); #endif handle_pass_open_arp_failure(m_get_socket(m), m); } /* * Populate a reject CPL_PASS_ACCEPT_RPL WR. */ static void mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) { struct cpl_pass_accept_req *req = cplhdr(req_mbuf); struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); unsigned int tid = GET_TID(req); m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet rpl->opt0h = htonl(F_TCAM_BYPASS); rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); rpl->opt2 = 0; rpl->rsvd = rpl->opt2; /* workaround for HW bug */ } /* * Send a deferred reject to an accept request. 
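 *
 * send_abort_rpl() above and process_pass_accept_req() below share an
 * allocate-or-defer pattern: try a non-sleeping mbuf allocation and, on
 * failure, park the original message via t3_defer_reply() so a context
 * that may sleep retries later.  A minimal sketch with a hypothetical
 * helper name and an elided CPL body:
 */
#if 0	/* illustrative sketch only */
static void
send_or_defer_reply(struct mbuf *m, struct toedev *tdev,
    void (*deferred)(struct toedev *, struct mbuf *))
{
	struct mbuf *reply;

	reply = m_gethdr(M_NOWAIT, MT_DATA);
	if (reply == NULL) {
		/* No memory right now; retry from process context. */
		t3_defer_reply(m, tdev, deferred);
		return;
	}
	/* ... build the CPL reply in 'reply' here ... */
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply);
	m_free(m);
}
#endif
/*
 * reject_pass_request: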
*/ static void reject_pass_request(struct toedev *tdev, struct mbuf *m) { struct mbuf *reply_mbuf; reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); mk_pass_accept_rpl(reply_mbuf, m); cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); m_free(m); } static void handle_syncache_event(int event, void *arg) { struct toepcb *toep = arg; switch (event) { case TOE_SC_ENTRY_PRESENT: /* * entry already exists - free toepcb * and l2t */ printf("syncache entry present\n"); toepcb_release(toep); break; case TOE_SC_DROP: /* * The syncache has given up on this entry * either it timed out, or it was evicted * we need to explicitly release the tid */ printf("syncache entry dropped\n"); toepcb_release(toep); break; default: log(LOG_ERR, "unknown syncache event %d\n", event); break; } } static void syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) { struct in_conninfo inc; struct tcpopt to; struct tcphdr th; struct inpcb *inp; int mss, wsf, sack, ts; uint32_t rcv_isn = ntohl(req->rcv_isn); bzero(&to, sizeof(struct tcpopt)); inp = so_sotoinpcb(lso); /* * Fill out information for entering us into the syncache */ bzero(&inc, sizeof(inc)); inc.inc_fport = th.th_sport = req->peer_port; inc.inc_lport = th.th_dport = req->local_port; th.th_seq = req->rcv_isn; th.th_flags = TH_SYN; toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; inc.inc_isipv6 = 0; inc.inc_len = 0; inc.inc_faddr.s_addr = req->peer_ip; inc.inc_laddr.s_addr = req->local_ip; DPRINTF("syncache add of %d:%d %d:%d\n", ntohl(req->local_ip), ntohs(req->local_port), ntohl(req->peer_ip), ntohs(req->peer_port)); mss = req->tcp_options.mss; wsf = req->tcp_options.wsf; ts = req->tcp_options.tstamp; sack = req->tcp_options.sack; to.to_mss = mss; to.to_wscale = wsf; to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); } /* * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket * lock held. Note that the sock here is a listening socket that is not owned * by the TOE. 
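 *
 * syncache_add_accept_req() above reduces the options HW reports to a
 * struct tcpopt.  The only subtle part is the flag packing; a minimal
 * sketch of it (standard TOF_* constants from <netinet/tcp_var.h>):
 */
#if 0	/* illustrative sketch only */
static void
pack_tcpopt(struct tcpopt *to, int mss, int wsf, int ts, int sack)
{
	bzero(to, sizeof(*to));
	to->to_mss = mss;
	to->to_wscale = wsf;
	/* Set each flag only when the peer actually offered the option. */
	to->to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) |
	    (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
}
#endif
/*
 * process_pass_accept_req: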
*/ static void process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, struct listen_ctx *lctx) { int rt_flags; struct l2t_entry *e; struct iff_mac tim; struct mbuf *reply_mbuf, *ddp_mbuf = NULL; struct cpl_pass_accept_rpl *rpl; struct cpl_pass_accept_req *req = cplhdr(m); unsigned int tid = GET_TID(req); struct tom_data *d = TOM_DATA(tdev); struct t3cdev *cdev = d->cdev; struct tcpcb *tp = so_sototcpcb(so); struct toepcb *newtoep; struct rtentry *dst; struct sockaddr_in nam; struct t3c_data *td = T3C_DATA(cdev); reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); if (__predict_false(reply_mbuf == NULL)) { if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) t3_defer_reply(m, tdev, reject_pass_request); else { cxgb_queue_tid_release(cdev, tid); m_free(m); } DPRINTF("failed to get reply_mbuf\n"); goto out; } if (tp->t_state != TCPS_LISTEN) { DPRINTF("socket not in listen state\n"); goto reject; } tim.mac_addr = req->dst_mac; tim.vlan_tag = ntohs(req->vlan_tag); if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); goto reject; } #ifdef notyet /* * XXX do route lookup to confirm that we're still listening on this * address */ if (ip_route_input(skb, req->local_ip, req->peer_ip, G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) goto reject; rt_flags = ((struct rtable *)skb->dst)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); dst_release(skb->dst); // done with the input route, release it skb->dst = NULL; if ((rt_flags & RTF_LOCAL) == 0) goto reject; #endif /* * XXX */ rt_flags = RTF_LOCAL; if ((rt_flags & RTF_LOCAL) == 0) goto reject; /* * Calculate values and add to syncache */ newtoep = toepcb_alloc(); if (newtoep == NULL) goto reject; bzero(&nam, sizeof(struct sockaddr_in)); nam.sin_len = sizeof(struct sockaddr_in); nam.sin_family = AF_INET; nam.sin_addr.s_addr =req->peer_ip; dst = rtalloc2((struct sockaddr *)&nam, 1, 0); if (dst == NULL) { printf("failed to find route\n"); goto reject; } e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, (struct sockaddr *)&nam); if (e == NULL) { DPRINTF("failed to get l2t\n"); } /* * Point to our listen socket until accept */ newtoep->tp_tp = tp; newtoep->tp_flags = TP_SYN_RCVD; newtoep->tp_tid = tid; newtoep->tp_toedev = tdev; tp->rcv_wnd = select_rcv_wnd(tdev, so); cxgb_insert_tid(cdev, d->client, newtoep, tid); so_lock(so); LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); so_unlock(so); newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; if (newtoep->tp_ulp_mode) { ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); if (ddp_mbuf == NULL) newtoep->tp_ulp_mode = 0; } CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); /* * XXX workaround for lack of syncache drop */ toepcb_hold(newtoep); syncache_add_accept_req(req, so, newtoep); rpl = cplhdr(reply_mbuf); reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); rpl->wr.wr_lo = 0; OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); rpl->opt2 = htonl(calc_opt2(so, tdev)); rpl->rsvd = rpl->opt2; /* workaround for HW bug */ rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | CPL_PASS_OPEN_ACCEPT); DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); l2t_send(cdev, reply_mbuf, e); m_free(m); if (newtoep->tp_ulp_mode) { __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_MASK, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_VAL, 1); } else DPRINTF("no DDP\n"); return; reject: if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) mk_pass_accept_rpl(reply_mbuf, m); else mk_tid_release(reply_mbuf, newtoep, tid); cxgb_ofld_send(cdev, reply_mbuf); m_free(m); out: #if 0 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); #else return; #endif } /* * Handle a CPL_PASS_ACCEPT_REQ message. */ static int do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */ struct tom_data *d = listen_ctx->tom_data; #if VALIDATE_TID struct cpl_pass_accept_req *req = cplhdr(m); unsigned int tid = GET_TID(req); struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; if (unlikely(!lsk)) { printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", cdev->name, (unsigned long)((union listen_entry *)ctx - t->stid_tab)); return CPL_RET_BUF_DONE; } if (unlikely(tid >= t->ntids)) { printk(KERN_ERR "%s: passive open TID %u too large\n", cdev->name, tid); return CPL_RET_BUF_DONE; } /* * For T3A the current user of the TID may have closed but its last * message(s) may have been backlogged so the TID appears to be still * in use. Just take the TID away, the connection can close at its * own leisure. For T3B this situation is a bug. */ if (!valid_new_tid(t, tid) && cdev->type != T3A) { printk(KERN_ERR "%s: passive open uses existing TID %u\n", cdev->name, tid); return CPL_RET_BUF_DONE; } #endif process_pass_accept_req(lso, m, &d->tdev, listen_ctx); return (0); } /* * Called when a connection is established to translate the TCP options * reported by HW to FreeBSD's native format. */ static void assign_rxopt(struct socket *so, unsigned int opt) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); inp_lock_assert(tp->t_inpcb); toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? 
TF_RCVD_SCALE : 0; if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) tp->rcv_scale = tp->request_r_scale; } /* * Completes some final bits of initialization for just established connections * and changes their state to TCP_ESTABLISHED. * * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. */ static void make_established(struct socket *so, u32 snd_isn, unsigned int opt) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; assign_rxopt(so, opt); /* *XXXXXXXXXXX * */ #ifdef notyet so->so_proto->pr_ctloutput = t3_ctloutput; #endif #if 0 inet_sk(sk)->id = tp->write_seq ^ jiffies; #endif /* * XXX not clear what rcv_wup maps to */ /* * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't * pass through opt0. */ if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); dump_toepcb(toep); #ifdef notyet /* * no clean interface for marking ARP up to date */ dst_confirm(sk->sk_dst_cache); #endif tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; soisconnected(so); } static int syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) { struct in_conninfo inc; struct tcpopt to; struct tcphdr th; int mss, wsf, sack, ts; struct mbuf *m = NULL; const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); unsigned int opt; #ifdef MAC #error "no MAC support" #endif opt = ntohs(req->tcp_opt); bzero(&to, sizeof(struct tcpopt)); /* * Fill out information for entering us into the syncache */ bzero(&inc, sizeof(inc)); inc.inc_fport = th.th_sport = req->peer_port; inc.inc_lport = th.th_dport = req->local_port; th.th_seq = req->rcv_isn; th.th_flags = TH_ACK; inc.inc_isipv6 = 0; inc.inc_len = 0; inc.inc_faddr.s_addr = req->peer_ip; inc.inc_laddr.s_addr = req->local_ip; mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; wsf = G_TCPOPT_WSCALE_OK(opt); ts = G_TCPOPT_TSTAMP(opt); sack = G_TCPOPT_SACK(opt); to.to_mss = mss; to.to_wscale = G_TCPOPT_SND_WSCALE(opt); to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", ntohl(req->local_ip), ntohs(req->local_port), ntohl(req->peer_ip), ntohs(req->peer_port), mss, wsf, ts, sack); return tcp_offload_syncache_expand(&inc, &to, &th, so, m); } /* * Process a CPL_PASS_ESTABLISH message. 
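 *
 * Note how assign_rxopt() and syncache_expand_establish_req() above derive
 * the MSS: HW reports an index into the adapter's MTU table, and 40 bytes
 * of fixed IPv4 + TCP header are subtracted from the table entry.  A
 * minimal sketch (hypothetical table argument, not part of this file):
 */
#if 0	/* illustrative sketch only */
static unsigned int
hw_opt_to_mss(const unsigned short *mtu_table, unsigned int opt)
{
	/* G_TCPOPT_MSS() extracts the MTU-table index from the option word. */
	return (mtu_table[G_TCPOPT_MSS(opt)] - 40);
}
#endif
/*
 * Process a CPL_PASS_ESTABLISH message.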
XXX a lot of the locking doesn't work * if we are in TCP_SYN_RECV due to crossed SYNs */ static int do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct cpl_pass_establish *req = cplhdr(m); struct toepcb *toep = (struct toepcb *)ctx; struct tcpcb *tp = toep->tp_tp; struct socket *so, *lso; struct t3c_data *td = T3C_DATA(cdev); struct sockbuf *snd, *rcv; // Complete socket initialization now that we have the SND_ISN struct toedev *tdev; tdev = toep->tp_toedev; inp_wlock(tp->t_inpcb); /* * * XXX need to add reference while we're manipulating */ so = lso = inp_inpcbtosocket(tp->t_inpcb); inp_wunlock(tp->t_inpcb); so_lock(so); LIST_REMOVE(toep, synq_entry); so_unlock(so); if (!syncache_expand_establish_req(req, &so, toep)) { /* * No entry */ CXGB_UNIMPLEMENTED(); } if (so == NULL) { /* * Couldn't create the socket */ CXGB_UNIMPLEMENTED(); } tp = so_sototcpcb(so); inp_wlock(tp->t_inpcb); snd = so_sockbuf_snd(so); rcv = so_sockbuf_rcv(so); snd->sb_flags |= SB_NOCOALESCE; rcv->sb_flags |= SB_NOCOALESCE; toep->tp_tp = tp; toep->tp_flags = 0; tp->t_toe = toep; reset_wr_list(toep); tp->rcv_wnd = select_rcv_wnd(tdev, so); tp->rcv_nxt = toep->tp_copied_seq; install_offload_ops(so); toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); toep->tp_wr_unacked = 0; toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); toep->tp_qset_idx = 0; toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); /* * XXX Cancel any keep alive timer */ make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); /* * XXX workaround for lack of syncache drop */ toepcb_release(toep); inp_wunlock(tp->t_inpcb); CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid); cxgb_log_tcb(cdev->adapter, toep->tp_tid); #ifdef notyet /* * XXX not sure how these checks map to us */ if (unlikely(sk->sk_socket)) { // simultaneous opens only sk->sk_state_change(sk); sk_wake_async(so, 0, POLL_OUT); } /* * The state for the new connection is now up to date. * Next check if we should add the connection to the parent's * accept queue. When the parent closes it resets connections * on its SYN queue, so check if we are being reset. If so we * don't need to do anything more, the coming ABORT_RPL will * destroy this socket. Otherwise move the connection to the * accept queue. * * Note that we reset the synq before closing the server so if * we are not being reset the stid is still open. */ if (unlikely(!tp->forward_skb_hint)) { // removed from synq __kfree_skb(skb); goto unlock; } #endif m_free(m); return (0); } /* * Fill in the right TID for CPL messages waiting in the out-of-order queue * and send them to the TOE. */ static void fixup_and_send_ofo(struct toepcb *toep) { struct mbuf *m; struct toedev *tdev = toep->tp_toedev; struct tcpcb *tp = toep->tp_tp; unsigned int tid = toep->tp_tid; log(LOG_NOTICE, "fixup_and_send_ofo\n"); inp_lock_assert(tp->t_inpcb); while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { /* * A variety of messages can be waiting but the fields we'll * be touching are common to all so any message type will do. */ struct cpl_close_con_req *p = cplhdr(m); p->wr.wr_lo = htonl(V_WR_TID(tid)); OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); } } /* * Updates socket state from an active establish CPL message. Runs with the * socket lock held. 
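 *
 * One convention worth calling out on this path: HW reports both ISNs
 * already incremented past the SYN, so make_established() above and the
 * function below seed the "next expected" sequence values directly.  A
 * minimal sketch (hypothetical helper, not part of this file):
 */
#if 0	/* illustrative sketch only */
static void
seed_isns(struct tcpcb *tp, uint32_t snd_isn, uint32_t rcv_isn)
{
	/* Both arguments are true ISN + 1, i.e. the first data sequence. */
	tp->iss = tp->snd_una = tp->snd_nxt = tp->snd_max = snd_isn;
	tp->irs = tp->rcv_nxt = rcv_isn;
}
#endif
/*
 * socket_act_establish: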
*/ static void socket_act_establish(struct socket *so, struct mbuf *m) { INIT_VNET_INET(so->so_vnet); struct cpl_act_establish *req = cplhdr(m); u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; if (__predict_false(tp->t_state != TCPS_SYN_SENT)) log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n", toep->tp_tid, tp->t_state); tp->ts_recent_age = ticks; tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn; toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs; make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); /* * Now that we finally have a TID send any CPL messages that we had to * defer for lack of a TID. */ if (mbufq_len(&toep->out_of_order_queue)) fixup_and_send_ofo(toep); if (__predict_false(so_state_get(so) & SS_NOFDREF)) { /* * XXX does this even make sense? */ so_sorwakeup(so); } m_free(m); #ifdef notyet /* * XXX assume no write requests permitted while socket connection is * incomplete */ /* * Currently the send queue must be empty at this point because the * socket layer does not send anything before a connection is * established. To be future proof though we handle the possibility * that there are pending buffers to send (either TX_DATA or * CLOSE_CON_REQ). First we need to adjust the sequence number of the * buffers according to the just learned write_seq, and then we send * them on their way. */ fixup_pending_writeq_buffers(sk); if (t3_push_frames(so, 1)) sk->sk_write_space(sk); #endif toep->tp_state = tp->t_state; V_tcpstat.tcps_connects++; } /* * Process a CPL_ACT_ESTABLISH message. */ static int do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct cpl_act_establish *req = cplhdr(m); unsigned int tid = GET_TID(req); unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); struct toepcb *toep = (struct toepcb *)ctx; struct tcpcb *tp = toep->tp_tp; struct socket *so; struct toedev *tdev; struct tom_data *d; if (tp == NULL) { free_atid(cdev, atid); return (0); } inp_wlock(tp->t_inpcb); /* * XXX */ so = inp_inpcbtosocket(tp->t_inpcb); tdev = toep->tp_toedev; /* blow up here if link was down */ d = TOM_DATA(tdev); /* * It's OK if the TID is currently in use, the owning socket may have * backlogged its last CPL message(s). Just take it away. */ toep->tp_tid = tid; toep->tp_tp = tp; so_insert_tid(d, toep, tid); free_atid(cdev, atid); toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); socket_act_establish(so, m); inp_wunlock(tp->t_inpcb); CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid); cxgb_log_tcb(cdev->adapter, toep->tp_tid); return (0); } /* * Process an acknowledgment of WR completion. Advance snd_una and send the * next batch of work requests from the write queue. 
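 *
 * The core of the function below is a credit-drain loop: retire every
 * pending WR whose credit cost is fully covered and leave a partially
 * acked WR at the head with its cost reduced.  A minimal self-contained
 * sketch over a hypothetical array-backed queue:
 */
#if 0	/* illustrative sketch only */
struct wr_entry {
	unsigned int credits;	/* WR cost in scheduler credits */
	unsigned int bytes;	/* payload bytes retired when acked */
};

static unsigned int
drain_wr_credits(struct wr_entry *q, int n, unsigned int credits)
{
	unsigned int bytes = 0;
	int i;

	for (i = 0; i < n && credits > 0; i++) {
		if (credits < q[i].credits) {
			/* Partially acked WR: shrink its remaining cost. */
			q[i].credits -= credits;
			break;
		}
		credits -= q[i].credits;	/* fully acked, retire it */
		bytes += q[i].bytes;
	}
	return (bytes);		/* bytes to drop from the send buffer */
}
#endif
/*
 * wr_ack: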
*/ static void wr_ack(struct toepcb *toep, struct mbuf *m) { struct tcpcb *tp = toep->tp_tp; struct cpl_wr_ack *hdr = cplhdr(m); struct socket *so; unsigned int credits = ntohs(hdr->credits); u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; struct sockbuf *snd; CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); inp_wlock(tp->t_inpcb); so = inp_inpcbtosocket(tp->t_inpcb); toep->tp_wr_avail += credits; if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; while (credits) { struct mbuf *p = peek_wr(toep); if (__predict_false(!p)) { log(LOG_ERR, "%u WR_ACK credits for TID %u with " "nothing pending, state %u wr_avail=%u\n", credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } CTR2(KTR_TOM, "wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len); KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list")); if (__predict_false(credits < p->m_pkthdr.csum_data)) { #if DEBUG_WR > 1 struct tx_data_wr *w = cplhdr(p); log(LOG_ERR, "TID %u got %u WR credits, need %u, len %u, " "main body %u, frags %u, seq # %u, ACK una %u," " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", toep->tp_tid, credits, p->csum, p->len, p->len - p->data_len, skb_shinfo(p)->nr_frags, ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), toep->tp_wr_avail, count_pending_wrs(tp) - credits); #endif p->m_pkthdr.csum_data -= credits; break; } else { dequeue_wr(toep); credits -= p->m_pkthdr.csum_data; bytes += p->m_pkthdr.len; CTR3(KTR_TOM, "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); m_free(p); } } #if DEBUG_WR check_wr_invariants(tp); #endif if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { #if VALIDATE_SEQ struct tom_data *d = TOM_DATA(TOE_DEV(so)); log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK " "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, toep->tp_tid, tp->snd_una); #endif goto out_free; } if (tp->snd_una != snd_una) { tp->snd_una = snd_una; tp->ts_recent_age = ticks; #ifdef notyet /* * Keep ARP entry "minty fresh" */ dst_confirm(sk->sk_dst_cache); #endif if (tp->snd_una == tp->snd_nxt) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } snd = so_sockbuf_snd(so); if (bytes) { CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); sockbuf_lock(snd); sbdrop_locked(snd, bytes); so_sowwakeup_locked(so); } if (snd->sb_sndptroff < snd->sb_cc) t3_push_frames(so, 0); out_free: inp_wunlock(tp->t_inpcb); m_free(m); } /* * Handler for TX_DATA_ACK CPL messages. */ static int do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; VALIDATE_SOCK(so); wr_ack(toep, m); return 0; } /* * Handler for TRACE_PKT CPL messages. Just sink these packets. */ static int do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) { m_freem(m); return 0; } /* * Reset a connection that is on a listener's SYN queue or accept queue, * i.e., one that has not had a struct socket associated with it. * Must be called from process context. * * Modeled after code in inet_csk_listen_stop().
*/ static void t3_reset_listen_child(struct socket *child) { struct tcpcb *tp = so_sototcpcb(child); t3_send_reset(tp->t_toe); } static void t3_child_disconnect(struct socket *so, void *arg) { struct tcpcb *tp = so_sototcpcb(so); if (tp->t_flags & TF_TOE) { inp_wlock(tp->t_inpcb); t3_reset_listen_child(so); inp_wunlock(tp->t_inpcb); } } /* * Disconnect offloaded established but not yet accepted connections sitting * on a server's accept_queue. We just send an ABORT_REQ at this point and * finish off the disconnect later as we may need to wait for the ABORT_RPL. */ void t3_disconnect_acceptq(struct socket *listen_so) { so_lock(listen_so); so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); so_unlock(listen_so); } /* * Reset offloaded connections sitting on a server's syn queue. As above * we send ABORT_REQ and finish off when we get ABORT_RPL. */ void t3_reset_synq(struct listen_ctx *lctx) { struct toepcb *toep; so_lock(lctx->lso); while (!LIST_EMPTY(&lctx->synq_head)) { toep = LIST_FIRST(&lctx->synq_head); LIST_REMOVE(toep, synq_entry); toep->tp_tp = NULL; t3_send_reset(toep); cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); toepcb_release(toep); } so_unlock(lctx->lso); } int t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, unsigned int nppods, unsigned int tag, unsigned int maxoff, unsigned int pg_off, unsigned int color) { unsigned int i, j, pidx; struct pagepod *p; struct mbuf *m; struct ulp_mem_io *req; unsigned int tid = toep->tp_tid; const struct tom_data *td = TOM_DATA(toep->tp_toedev); unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", gl, nppods, tag, maxoff, pg_off, color); for (i = 0; i < nppods; ++i) { m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); req = mtod(m, struct ulp_mem_io *); m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); req->wr.wr_lo = 0; req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | V_ULPTX_CMD(ULP_MEM_WRITE)); req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); p = (struct pagepod *)(req + 1); if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | V_PPOD_COLOR(color)); p->pp_max_offset = htonl(maxoff); p->pp_page_offset = htonl(pg_off); p->pp_rsvd = 0; for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) p->pp_addr[j] = pidx < gl->dgl_nelem ? htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; } else p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ send_or_defer(toep, m, 0); ppod_addr += PPOD_SIZE; } return (0); } /* * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. */ static inline void mk_cpl_barrier_ulp(struct cpl_barrier *b) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); b->opcode = CPL_BARRIER; } /* * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 
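 *
 * A note on t3_setup_ppods() above: each page pod steps through the gather
 * list four pages at a time (pidx starts at 4 * i) yet stores five
 * addresses, so consecutive pods overlap by one page; HW uses the extra
 * slot when a placement straddles a pod boundary.  Sketch of the indexing
 * (hypothetical helper over pre-translated addresses):
 */
#if 0	/* illustrative sketch only */
static void
fill_ppod_addrs(uint64_t *pp_addr, const uint64_t *pages, int npages, int pod)
{
	int pidx = 4 * pod, j;

	for (j = 0; j < 5; j++, pidx++)
		pp_addr[j] = (pidx < npages) ? pages[pidx] : 0;
}
#endif
/*
 * mk_get_tcb_ulp: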
*/ static inline void mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); req->cpuno = htons(cpuno); } /* * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. */ static inline void mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, unsigned int word, uint64_t mask, uint64_t val) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", tid, word, mask, val); txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); req->reply = V_NO_REPLY(1); req->cpu_idx = 0; req->word = htons(word); req->mask = htobe64(mask); req->val = htobe64(val); } /* * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. */ static void mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | V_RX_CREDITS(credits)); } void t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) { unsigned int wrlen; struct mbuf *m; struct work_request_hdr *wr; struct cpl_barrier *lock; struct cpl_set_tcb_field *req; struct cpl_get_tcb *getreq; struct ddp_state *p = &toep->tp_ddp_state; #if 0 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); #endif wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + sizeof(*getreq); m = m_gethdr_nofail(wrlen); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); wr = mtod(m, struct work_request_hdr *); bzero(wr, wrlen); wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); m->m_pkthdr.len = m->m_len = wrlen; lock = (struct cpl_barrier *)(wr + 1); mk_cpl_barrier_ulp(lock); req = (struct cpl_set_tcb_field *)(lock + 1); CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); /* Hmmm, not sure if this is actually a good thing: reactivating * the other buffer might be an issue if it has been completed * already. However, that is unlikely, since the fact that the UBUF * is not completed indicates that there is no outstanding data.
*/ if (bufidx == 0) mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_BUF0_VALID(1), V_TF_DDP_ACTIVE_BUF(1)); else mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_BUF1_VALID(1), 0); getreq = (struct cpl_get_tcb *)(req + 1); mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); /* Keep track of the number of outstanding CPL_GET_TCB requests */ p->get_tcb_count++; #ifdef T3_TRACE T3_TRACE1(TIDTB(so), "t3_cancel_ddpbuf: bufidx %u", bufidx); #endif cxgb_ofld_send(TOEP_T3C_DEV(toep), m); } /** * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one * @toep: the toepcb associated with the buffers * @bufidx: index of HW DDP buffer (0 or 1) * @tag0: new tag for HW buffer 0 * @tag1: new tag for HW buffer 1 * @len: new length for HW buf @bufidx * * Sends a compound WR to overlay a new DDP buffer on top of an existing * buffer by changing the buffer tag and length and setting the valid and * active flag accordingly. The caller must ensure the new buffer is at * least as big as the existing one. Since we typically reprogram both HW * buffers this function sets both tags for convenience. Read the TCB to * determine how much data was written into the buffer before the overlay * took place. */ void t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, unsigned int tag1, unsigned int len) { unsigned int wrlen; struct mbuf *m; struct work_request_hdr *wr; struct cpl_get_tcb *getreq; struct cpl_set_tcb_field *req; struct ddp_state *p = &toep->tp_ddp_state; CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)", bufidx, tag0, tag1, len); #if 0 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); #endif wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); m = m_gethdr_nofail(wrlen); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); wr = mtod(m, struct work_request_hdr *); m->m_pkthdr.len = m->m_len = wrlen; bzero(wr, wrlen); /* Set the ATOMIC flag to make sure that TP processes the following * CPLs in an atomic manner and no wire segments can be interleaved.
*/ wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); req = (struct cpl_set_tcb_field *)(wr + 1); mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, V_TCB_RX_DDP_BUF0_TAG(tag0) | V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); req++; if (bufidx == 0) { mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); req++; mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_BUF0_VALID(1)); } else { mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); req++; mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); } getreq = (struct cpl_get_tcb *)(req + 1); mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); /* Keep track of the number of outstanding CPL_GET_TCB requests */ p->get_tcb_count++; #ifdef T3_TRACE T3_TRACE4(TIDTB(sk), "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " "len %d", bufidx, tag0, tag1, len); #endif cxgb_ofld_send(TOEP_T3C_DEV(toep), m); } /* * Sends a compound WR containing all the CPL messages needed to program the * two HW DDP buffers, namely optionally setting up the length and offset of * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. */ void t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, unsigned int len1, unsigned int offset1, uint64_t ddp_flags, uint64_t flag_mask, int modulate) { unsigned int wrlen; struct mbuf *m; struct work_request_hdr *wr; struct cpl_set_tcb_field *req; CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); #if 0 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); #endif wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + (len1 ? sizeof(*req) : 0) + (modulate ?
sizeof(struct cpl_rx_data_ack) : 0); m = m_gethdr_nofail(wrlen); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); wr = mtod(m, struct work_request_hdr *); bzero(wr, wrlen); wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); m->m_pkthdr.len = m->m_len = wrlen; req = (struct cpl_set_tcb_field *)(wr + 1); if (len0) { /* program buffer 0 offset and length */ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); req++; } if (len1) { /* program buffer 1 offset and length */ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); req++; } mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, ddp_flags); if (modulate) { mk_rx_data_ack_ulp(toep, (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, toep->tp_copied_seq - toep->tp_rcv_wup); toep->tp_rcv_wup = toep->tp_copied_seq; } #ifdef T3_TRACE T3_TRACE5(TIDTB(sk), "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " "modulate %d", len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, modulate); #endif cxgb_ofld_send(TOEP_T3C_DEV(toep), m); } void t3_init_wr_tab(unsigned int wr_len) { int i; if (mbuf_wrs[1]) /* already initialized */ return; for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { int sgl_len = (3 * i) / 2 + (i & 1); sgl_len += 3; mbuf_wrs[i] = sgl_len <= wr_len ? 1 : 1 + (sgl_len - 2) / (wr_len - 1); } wrlen = wr_len * 8; } int t3_init_cpl_io(void) { #ifdef notyet tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); if (!tcphdr_skb) { log(LOG_ERR, "Chelsio TCP offload: can't allocate sk_buff\n"); return -1; } skb_put(tcphdr_skb, sizeof(struct tcphdr)); tcphdr_skb->h.raw = tcphdr_skb->data; memset(tcphdr_skb->data, 0, tcphdr_skb->len); #endif t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); return (0); } Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 185087) +++ head/sys/net/if.c (revision 185088) @@ -1,2850 +1,2853 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) /*XXX*/ #include #include #ifdef INET6 #include #include #endif #endif #ifdef INET #include #endif #ifdef DEV_CARP #include #endif #include SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); void (*bstp_linkstate_p)(struct ifnet *ifp, int state); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. 
*/ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_grow(void); static void if_init(void *); static void if_qflush(struct ifaltq *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static void if_slowtimo(void *); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_rtdel(struct radix_node *, void *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void if_start_deferred(void *context, int pending); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif -int if_index = 0; -int ifqmaxlen = IFQ_MAXLEN; +#ifdef VIMAGE_GLOBALS struct ifnethead ifnet; /* depend on static init XXX */ struct ifgrouphead ifg_head; +int if_index; +static int if_indexlim; +/* Table of ifnet/cdev by index. Locked with ifnet_lock. */ +static struct ifindex_entry *ifindex_table; +static struct knlist ifklist; +#endif + +int ifqmaxlen = IFQ_MAXLEN; struct mtx ifnet_lock; static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; -static int if_indexlim = 8; -static struct knlist ifklist; - -/* - * Table of ifnet/cdev by index. Locked with ifnet_lock. - */ -static struct ifindex_entry *ifindex_table = NULL; - static void filt_netdetach(struct knote *kn); static int filt_netdev(struct knote *kn, long hint); static struct filterops netdev_filtops = { 1, NULL, filt_netdetach, filt_netdev }; /* * System initialization */ SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); SYSINIT(interface_check, SI_SUB_PROTO_IF, SI_ORDER_FIRST, if_slowtimo, NULL); MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex(u_short idx) { INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); ifp = V_ifindex_table[idx].ife_ifnet; IFNET_RUNLOCK(); return (ifp); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { INIT_VNET_NET(curvnet); IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { INIT_VNET_NET(curvnet); struct ifaddr *ifa; IFNET_RLOCK(); ifa = ifnet_byindex(idx)->if_addr; IFNET_RUNLOCK(); return (ifa); } struct cdev * ifdev_byindex(u_short idx) { INIT_VNET_NET(curvnet); struct cdev *cdev; IFNET_RLOCK(); cdev = V_ifindex_table[idx].ife_dev; IFNET_RUNLOCK(); return (cdev); } static void ifdev_setbyindex(u_short idx, struct cdev *cdev) { INIT_VNET_NET(curvnet); IFNET_WLOCK(); V_ifindex_table[idx].ife_dev = cdev; IFNET_WUNLOCK(); } static d_open_t netopen; static d_close_t netclose; static d_ioctl_t netioctl; static d_kqfilter_t netkqfilter; static struct cdevsw net_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = netopen, .d_close = netclose, .d_ioctl = netioctl, .d_name = "net", .d_kqfilter = netkqfilter, }; static int netopen(struct cdev *dev, int flag, int mode, struct thread *td) { return (0); } 
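/*
 * A note on the ifindex table managed below: if_grow() doubles
 * V_if_indexlim and reallocates V_ifindex_table, copying the old half
 * across.  The doubling pattern, as a minimal sketch (hypothetical
 * standalone helper, not part of this file):
 */
#if 0	/* illustrative sketch only */
static void
grow_table(void **tablep, u_int *limp, size_t entsize)
{
	void *e;

	*limp <<= 1;			/* double the index limit */
	e = malloc(*limp * entsize, M_IFNET, M_WAITOK | M_ZERO);
	if (*tablep != NULL) {
		/* The new size is exactly twice the old, hence n / 2. */
		memcpy(e, *tablep, (*limp / 2) * entsize);
		free(*tablep, M_IFNET);
	}
	*tablep = e;
}
#endif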
static int netclose(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int netioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifnet *ifp; int error, idx; /* only support interface specific ioctls */ if (IOCGROUP(cmd) != 'i') return (EOPNOTSUPP); idx = dev2unit(dev); if (idx == 0) { /* * special network device, not interface. */ if (cmd == SIOCGIFCONF) return (ifconf(cmd, data)); /* XXX remove cmd */ #ifdef __amd64__ if (cmd == SIOCGIFCONF32) return (ifconf(cmd, data)); /* XXX remove cmd */ #endif return (EOPNOTSUPP); } ifp = ifnet_byindex(idx); if (ifp == NULL) return (ENXIO); error = ifhwioctl(cmd, ifp, data, td); if (error == ENOIOCTL) error = EOPNOTSUPP; return (error); } static int netkqfilter(struct cdev *dev, struct knote *kn) { INIT_VNET_NET(curvnet); struct knlist *klist; struct ifnet *ifp; int idx; switch (kn->kn_filter) { case EVFILT_NETDEV: kn->kn_fop = &netdev_filtops; break; default: return (EINVAL); } idx = dev2unit(dev); if (idx == 0) { klist = &V_ifklist; } else { ifp = ifnet_byindex(idx); if (ifp == NULL) return (1); klist = &ifp->if_klist; } kn->kn_hook = (caddr_t)klist; knlist_add(klist, kn, 0); return (0); } static void filt_netdetach(struct knote *kn) { struct knlist *klist = (struct knlist *)kn->kn_hook; knlist_remove(klist, kn, 0); } static int filt_netdev(struct knote *kn, long hint) { struct knlist *klist = (struct knlist *)kn->kn_hook; /* * Currently NOTE_EXIT is abused to indicate device detach. */ if (hint == NOTE_EXIT) { kn->kn_data = NOTE_LINKINV; kn->kn_flags |= (EV_EOF | EV_ONESHOT); knlist_remove_inevent(klist, kn); return (1); } if (hint != 0) kn->kn_data = hint; /* current status */ if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ /* ARGSUSED*/ static void if_init(void *dummy __unused) { INIT_VNET_NET(curvnet); + + V_if_index = 0; + V_ifindex_table = NULL; + V_if_indexlim = 8; IFNET_LOCK_INIT(); TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); if_grow(); /* create initial table */ ifdev_setbyindex(0, make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "network")); if_clone_init(); } static void if_grow(void) { INIT_VNET_NET(curvnet); u_int n; struct ifindex_entry *e; V_if_indexlim <<= 1; n = V_if_indexlim * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet* if_alloc(u_char type) { INIT_VNET_NET(curvnet); struct ifnet *ifp; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); /* * Try to find an empty slot below if_index. If we fail, take * the next slot. * * XXX: should be locked! */ for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { if (ifnet_byindex(ifp->if_index) == NULL) break; } /* Catch if_index overflow. 
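 */

/*
 * [Editorial aside, not part of this change]  if_grow() above is a
 * classic double-and-copy table: double the limit, allocate the larger
 * table, copy the old half across, release it.  The same pattern in
 * isolation, with hypothetical names:
 */
#if 0	/* illustrative only, never compiled */
static void
example_grow(struct ifindex_entry **tablep, int *limp)
{
        struct ifindex_entry *e;
        u_int n;

        *limp <<= 1;                    /* double the limit */
        n = *limp * sizeof(*e);
        e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
        if (*tablep != NULL) {
                memcpy(e, *tablep, n / 2);      /* old table was half n */
                free(*tablep, M_IFNET);
        }
        *tablep = e;
}
#endif
/*
 * Catch if_index overflow: if the namespace is exhausted, the search
 * loop above leaves an index that has wrapped past the type's maximum.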
*/ if (ifp->if_index < 1) { free(ifp, M_IFNET); return (NULL); } if (ifp->if_index > V_if_index) V_if_index = ifp->if_index; if (V_if_index >= V_if_indexlim) if_grow(); ifp->if_type = type; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); return (NULL); } } IFNET_WLOCK(); ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); IF_ADDR_LOCK_INIT(ifp); return (ifp); } /* * Free the struct ifnet, the associated index, and the layer 2 common * structure if needed. All the work is done in if_free_type(). * * Do not add code to this function! Add it to if_free_type(). */ void if_free(struct ifnet *ifp) { if_free_type(ifp, ifp->if_type); } /* * Do the actual work of freeing a struct ifnet, associated index, and * layer 2 common structure. This version should only be called by * interfaces that switch their type after calling if_alloc(). */ void if_free_type(struct ifnet *ifp, u_char type) { INIT_VNET_NET(curvnet); /* ifp->if_vnet can be NULL here ! */ if (ifp != ifnet_byindex(ifp->if_index)) { if_printf(ifp, "%s: value was not if_alloced, skipping\n", __func__); return; } IFNET_WLOCK(); ifnet_setbyindex(ifp->if_index, NULL); /* XXX: should be locked with if_findindex() */ while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) V_if_index--; IFNET_WUNLOCK(); if (if_com_free[type] != NULL) if_com_free[type](ifp->if_l2com, type); IF_ADDR_LOCK_DESTROY(ifp); free(ifp, M_IFNET); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We do more initialization here than is probably a good idea. * Some of this should probably move to if_alloc(). * - We should probably do more sanity checking. For instance we don't * do anything to ensure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { INIT_VNET_NET(curvnet); unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); ifp->if_afdata_initialized = 0; TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_prefixhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); if_addgroup(ifp, IFG_ALL); knlist_init(&ifp->if_klist, NULL, NULL, NULL, NULL); getmicrotime(&ifp->if_lastchange); ifp->if_data.ifi_epoch = time_uptime; ifp->if_data.ifi_datalen = sizeof(struct if_data); #ifdef MAC mac_ifnet_init(ifp); mac_ifnet_create(ifp); #endif ifdev_setbyindex(ifp->if_index, make_dev(&net_cdevsw, ifp->if_index, UID_ROOT, GID_WHEEL, 0600, "%s/%s", net_cdevsw.d_name, ifp->if_xname)); make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", net_cdevsw.d_name, ifp->if_index); mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); /* * Create a link-level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possible name so we can do * a rename in place later.
*/ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO); IFA_LOCK_INIT(ifa); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; ifa->ifa_refcnt = 1; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); ifp->if_broadcastaddr = NULL; /* reliably crash if used uninitialized */ /* * XXX: why do we warn about this? We're correcting it and most * drivers just set the value the way we do. */ if (ifp->if_snd.ifq_maxlen == 0) { if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n"); ifp->if_snd.ifq_maxlen = ifqmaxlen; } ifp->if_snd.altq_type = 0; ifp->if_snd.altq_disc = NULL; ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE; ifp->if_snd.altq_tbr = NULL; ifp->if_snd.altq_ifp = ifp; IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); if (ifp->if_watchdog != NULL) if_printf(ifp, "WARNING: using obsoleted if_watchdog interface\n"); if (ifp->if_flags & IFF_NEEDSGIANT) if_printf(ifp, "WARNING: using obsoleted IFF_NEEDSGIANT flag\n"); } static void if_attachdomain(void *dummy) { INIT_VNET_NET(curvnet); struct ifnet *ifp; int s; s = splnet(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); splx(s); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; int s; s = splnet(); /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ if (IF_AFDATA_TRYLOCK(ifp) == 0) { splx(s); return; } if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); splx(s); printf("if_attachdomain called more than once on %s\n", ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } splx(s); } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! 
ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IFAFREE(ifa); } } /* * Remove any multicast network addresses from an interface. */ void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_LOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_UNLOCK(ifp); } /* * Detach an interface, removing it from the * list of "active" interfaces. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { INIT_VNET_NET(ifp->if_vnet); struct ifaddr *ifa; struct radix_node_head *rnh; int s; int i; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } IFNET_WUNLOCK(); if (!found) return; /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ s = splnet(); if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and complain * about it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; destroy_dev(ifdev_byindex(ifp->if_index)); ifdev_setbyindex(ifp->if_index, NULL); /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IFAFREE(ifa); } /* * Delete all remaining routes using this interface. * Unfortunately the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { int j; for (j = 0; j < rt_numfibs; j++) { if ((rnh = V_rt_tables[j][i]) == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); RADIX_NODE_HEAD_UNLOCK(rnh); } } /* Announce that the interface is gone.
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); IF_AFDATA_LOCK(ifp); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } IF_AFDATA_UNLOCK(ifp); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ KNOTE_UNLOCKED(&ifp->if_klist, NOTE_EXIT); knlist_clear(&ifp->if_klist, 0); knlist_destroy(&ifp->if_klist); mtx_destroy(&ifp->if_snd.ifq_mtx); IF_AFDATA_DESTROY(ifp); splx(s); } /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); EVENTHANDLER_INVOKE(group_attach_event, ifg); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_LOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_UNLOCK(ifp); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_LOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_UNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Stores all groups from an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_LOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_UNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ 
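/*
 * [Editorial aside, not part of this change]  The ifgr_len == 0 case
 * above gives callers a two-pass protocol: ask for the size first, then
 * fetch the records.  A hypothetical user-space sketch (socket setup and
 * error handling elided; "em0" is an assumed interface name):
 */
#if 0	/* illustrative only, never compiled */
        struct ifgroupreq ifgr;

        memset(&ifgr, 0, sizeof(ifgr));
        strlcpy(ifgr.ifgr_name, "em0", sizeof(ifgr.ifgr_name));
        ioctl(s, SIOCGIFGROUP, &ifgr);          /* pass 1: size only */
        ifgr.ifgr_groups = malloc(ifgr.ifgr_len);
        ioctl(s, SIOCGIFGROUP, &ifgr);          /* pass 2: copy groups out */
#endif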
IF_ADDR_LOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_UNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_UNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_UNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { INIT_VNET_NET(curvnet); struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rn pointer to node in the routing table * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated * */ static int if_rtdel(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; int err; if (rt->rt_ifp == ifp) { /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, (struct rtentry **) NULL, rt->rt_fibnum); if (err) { log(LOG_WARNING, "if_rtdel: error %d\n", err); } } return (0); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_equal(a1, a2) \ (bcmp((a1), (a2), ((a1))->sa_len) == 0) #define sa_dl_equal(a1, a2) \ ((((struct sockaddr_dl *)(a1))->sdl_len == \ ((struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ LLADDR((struct sockaddr_dl *)(a2)), \ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) goto done; /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) goto done; } ifa = NULL; done: IFNET_RUNLOCK(); return (ifa); } /* * Locate an interface based on the broadcast address. 
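 */

/*
 * [Editorial aside, not part of this change]  Why sa_dl_equal() above
 * cannot be a plain sa_equal(): an AF_LINK sockaddr embeds the interface
 * name in sdl_data ahead of the hardware address, so two entries for the
 * same MAC can still differ byte-wise.  A hypothetical helper mirroring
 * the macro:
 */
#if 0	/* illustrative only, never compiled */
static int
example_dl_equal(struct sockaddr_dl *a, struct sockaddr_dl *b)
{
        /* Same total length, and same bytes at LLADDR() (past the name). */
        return (a->sdl_len == b->sdl_len &&
            bcmp(LLADDR(a), LLADDR(b), a->sdl_alen) == 0);
}
#endif
/*
 * Locate an interface based on the broadcast address: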
*/ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) goto done; } ifa = NULL; done: IFNET_RUNLOCK(); return (ifa); } /* * Locate the point-to-point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) goto done; } } ifa = NULL; done: IFNET_RUNLOCK(); return (ifa); } /* * Find an interface on a specific network. If many match, the most * specific one is chosen. */ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr) { INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = (struct ifaddr *) 0; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan through each interface, looking for ones that have * addresses in this address family. */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { /* * If we have a special address handler, * then use it instead of the generic one. */ if (ifa->ifa_claim_addr) { if ((*ifa->ifa_claim_addr)(ifa, addr)) goto done; continue; } /* * Scan all the bits in the ifa's address. * If a bit disagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one) then remember the new one * before continuing to search * for an even better one. */ if (ifa_maybe == 0 || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) ifa_maybe = ifa; } } } ifa = ifa_maybe; done: IFNET_RUNLOCK(); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address.
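 */

/*
 * [Editorial aside, not part of this change]  The masked compare these
 * lookups open-code, isolated into a hypothetical helper: two addresses
 * lie on the same network if they agree on every bit the netmask covers.
 */
#if 0	/* illustrative only, never compiled */
static int
example_masked_match(const char *a, const char *b,
    const struct sockaddr *mask)
{
        const char *m = mask->sa_data;
        const char *lim = (const char *)mask + mask->sa_len;

        for (; m < lim; a++, b++, m++)
                if ((*a ^ *b) & *m)
                        return (0);     /* differ inside the mask */
        return (1);
}
#endif
/*
 * Find an interface address specific to an interface best matching
 * a given address: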
*/ struct ifaddr * ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = 0; u_int af = addr->sa_family; if (af >= AF_MAX) return (0); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == 0) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } #include /* * Default action when installing a route with a Link Level gateway. * Look up an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; RT_LOCK_ASSERT(rt); if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { IFAREF(ifa); /* XXX */ oifa = rt->rt_ifa; rt->rt_ifa = ifa; IFAFREE(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } /* * Mark an interface down and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); if_qflush(&ifp->if_snd); #ifdef DEV_CARP if (ifp->if_carp) carp_carpdev_state(ifp->if_carp); #endif rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); #ifdef DEV_CARP if (ifp->if_carp) carp_carpdev_state(ifp->if_carp); #endif rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *, int); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post the event to a taskqueue, and all the work * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed.
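 */

/*
 * [Editorial aside, not part of this change]  The defer-to-taskqueue
 * shape used here, reduced to a skeleton with hypothetical names: the
 * caller records the new state and enqueues; the task runs later,
 * without the caller's locks held, and 'pending' counts coalesced
 * requests.
 */
#if 0	/* illustrative only, never compiled */
static void
example_change(struct example_softc *sc, int state)
{
        sc->ex_state = state;                   /* record for the task */
        taskqueue_enqueue(taskqueue_swi, &sc->ex_task);
}

static void
example_task(void *arg, int pending)
{
        struct example_softc *sc = arg;

        example_notify(sc->ex_state, pending);  /* taskqueue context */
}
#endif
/* Return if the state hasn't changed: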
*/ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; int link; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (link_state == LINK_STATE_UP) link = NOTE_LINKUP; else if (link_state == LINK_STATE_DOWN) link = NOTE_LINKDOWN; else link = NOTE_LINKINV; KNOTE_UNLOCKED(&ifp->if_klist, link); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp, link); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && IFP2AC(ifp)->ac_netgraph != NULL) (*ng_ether_link_state_p)(ifp, link_state); #ifdef DEV_CARP if (ifp->if_carp) carp_carpdev_state(ifp->if_carp); #endif if (ifp->if_bridge) { KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!")); (*bstp_linkstate_p)(ifp, link_state); } if (ifp->if_lagg) { KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!")); (*lagg_linkstate_p)(ifp, link_state); } devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. * NOTE: must be called at splnet or equivalent. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ static void if_qflush(struct ifaltq *ifq) { struct mbuf *m, *n; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_act; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Handle interface watchdog timer routines. Called * from softclock, we decrement timers (if set) and * call the appropriate interface routine on expiration. * * XXXRW: Note that because timeouts run with Giant, if_watchdog() is called * holding Giant. If we switch to an MPSAFE callout, we likely need to grab * Giant before entering if_watchdog() on an IFF_NEEDSGIANT interface. */ static void if_slowtimo(void *arg) { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; int s = splimp(); IFNET_RLOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_NET(vnet_iter); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_timer == 0 || --ifp->if_timer) continue; if (ifp->if_watchdog) (*ifp->if_watchdog)(ifp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IFNET_RUNLOCK(); splx(s); timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); } /* * Map interface name to * interface structure pointer. */ struct ifnet * ifunit(const char *name) { INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK(); return (ifp); } /* * Hardware specific interface ioctls.
*/ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; struct ifstat *ifs; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: ifr->ifr_phys = ifp->if_physical; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_SMART) { /* Smart drivers twiddle their own routes */ } else if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { int s = splimp(); if_down(ifp); splx(s); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { int s = splimp(); if_up(ifp); splx(s); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { IFF_LOCKGIANT(ifp); (void) (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; IFA_LOCK(ifa); sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). 
*/ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IFA_UNLOCK(ifa); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); break; case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. 
*/ IF_ADDR_LOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_UNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSLIFPHYADDR: case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: ifs = (struct ifstat *)data; ifs->ascii[0] = '\0'; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGLIFPHYADDR: case SIOCGIFMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; switch (cmd) { case SIOCGIFCONF: case OSIOCGIFCONF: #ifdef __amd64__ case SIOCGIFCONF32: #endif return (ifconf(cmd, data)); } ifr = (struct ifreq *)data; switch (cmd) { case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error) return (error); return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL)); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error) return (error); return if_clone_destroy(ifr->ifr_name); case SIOCIFGCLONERS: return (if_clone_list((struct if_clonereq *)data)); case SIOCGIFGMEMB: return (if_getgroupmembers((struct ifgroupreq *)data)); } ifp = ifunit(ifr->ifr_name); if (ifp == 0) return (ENXIO); error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) return (error); oif_flags = ifp->if_flags; if (so->so_proto == 0) return (EOPNOTSUPP); #ifndef COMPAT_43 error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); #else { int ocmd = cmd; switch (cmd) { case SIOCSIFDSTADDR: case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: #if BYTE_ORDER != BIG_ENDIAN if (ifr->ifr_addr.sa_family == 0 && ifr->ifr_addr.sa_len < 16) { ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len; ifr->ifr_addr.sa_len = 16; } #else if (ifr->ifr_addr.sa_len == 0) ifr->ifr_addr.sa_len = 16; #endif break; case OSIOCGIFADDR: cmd = SIOCGIFADDR; break; case OSIOCGIFDSTADDR: cmd = SIOCGIFDSTADDR; break; case OSIOCGIFBRDADDR: cmd = SIOCGIFBRDADDR; break; case OSIOCGIFNETMASK: cmd = SIOCGIFNETMASK; } error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); switch (ocmd) { case OSIOCGIFADDR: case OSIOCGIFDSTADDR: case OSIOCGIFBRDADDR: case OSIOCGIFNETMASK: *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family; } } #endif /* COMPAT_43 */ if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 DELAY(100); /* XXX: temporary workaround for fxp issue */ if (ifp->if_flags & IFF_UP) { int s = splimp(); in6_if_up(ifp); splx(s); } #endif } return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters in case if_ioctl() fails */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if ours isn't the only reference, in which case touching the * refcount is enough. Actually toggle the interface flag only if we * are the first or last.
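 */

/*
 * [Editorial aside, not part of this change]  The refcount lets
 * independent consumers compose: only the 0->1 and 1->0 transitions
 * reach the driver.  A hypothetical caller sequence:
 */
#if 0	/* illustrative only, never compiled */
        error = ifpromisc(ifp, 1);      /* first "on": programs the NIC */
        error = ifpromisc(ifp, 1);      /* just bumps if_pcount */
        error = ifpromisc(ifp, 0);      /* just drops if_pcount */
        error = ifpromisc(ifp, 0);      /* last "off": reprograms the NIC */
#endif
/*
 * Toggle the flag on the first or last reference: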
*/ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); IFF_UNLOCKGIANT(ifp); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { INIT_VNET_NET(curvnet); struct ifconf *ifc = (struct ifconf *)data; #ifdef __amd64__ struct ifconf32 *ifc32 = (struct ifconf32 *)data; struct ifconf ifc_swab; #endif struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; #ifdef __amd64__ if (cmd == SIOCGIFCONF32) { ifc_swab.ifc_len = ifc32->ifc_len; ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf; ifc = &ifc_swab; } #endif /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); /* could sleep XXX */ TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. 
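 */

/*
 * [Editorial aside, not part of this change]  A hypothetical user-space
 * consumer of SIOCGIFCONF, showing the caller-sized buffer that the sbuf
 * assembled below is eventually copied out into (socket setup and error
 * handling elided):
 */
#if 0	/* illustrative only, never compiled */
        struct ifconf ifc;
        char buf[8192];

        ifc.ifc_len = sizeof(buf);
        ifc.ifc_buf = buf;
        ioctl(s, SIOCGIFCONF, &ifc);
        /* ifc.ifc_len now holds the number of bytes actually returned. */
#endif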
*/ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (jailed(curthread->td_ucred) && prison_if(curthread->td_ucred, sa)) continue; addrs++; #ifdef COMPAT_43 if (cmd == OSIOCGIFCONF) { struct osockaddr *osa = (struct osockaddr *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else #endif if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (!sbuf_overflowed(sb)) valid_len = sbuf_len(sb); } if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (!sbuf_overflowed(sb)) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; #ifdef __amd64__ if (cmd == SIOCGIFCONF32) ifc32->ifc_len = valid_len; #endif sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
*/ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); KASSERT(ifma->ifma_protospec == NULL, ("if_freemulti: protospec not NULL")); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_LOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_UNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. If 'llsa' was * returned, we will need to free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate it as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that the 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_UNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it.
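 */

/*
 * [Editorial aside, not part of this change]  A hypothetical
 * protocol-side pairing: keep the ifma returned through 'retifma' and
 * hand it back when leaving the group ('gsa' is an assumed group
 * address).
 */
#if 0	/* illustrative only, never compiled */
        struct ifmultiaddr *ifma;

        error = if_addmulti(ifp, (struct sockaddr *)&gsa, &ifma);
        if (error == 0) {
                /* ... membership in force ... */
                if_delmulti_ifma(ifma);         /* drop our reference */
        }
#endif
/*
 * We are certain we have added something, so call down to the driver: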
*/ if (ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); IFF_UNLOCKGIANT(ifp); } if (llsa != NULL) free(llsa, M_IFMADDR); return (0); free_llsa_out: if (llsa != NULL) free(llsa, M_IFMADDR); unlock_out: IF_ADDR_UNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; INIT_VNET_NET(ifp->if_vnet); IFNET_RLOCK(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_LOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_UNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); IFF_UNLOCKGIANT(ifp); } return (0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. Callers should * hold IFF_LOCKGIANT() to avoid a LOR in case the hardware needs to be * reconfigured. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { #ifdef DIAGNOSTIC INIT_VNET_NET(curvnet); #endif struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_LOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_UNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); IFF_UNLOCKGIANT(ifp); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_LOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. 
*/ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. */ int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; ifa = ifp->if_addr; if (ifa == NULL) return (EINVAL); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) return (EINVAL); if (len != sdl->sdl_alen) /* don't allow length to change */ return (EINVAL); switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { IFF_LOCKGIANT(ifp); ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); IFF_UNLOCKGIANT(ifp); } #ifdef INET /* * Also send gratuitous ARPs to notify other nodes about * the address change. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } #endif } return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) { va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /* * When an interface is marked IFF_NEEDSGIANT, its if_start() routine cannot * be called without Giant. However, we often can't acquire the Giant lock * at those points; instead, we run it via a task queue that holds Giant via * if_start_deferred. 
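 */

/*
 * [Editorial aside, not part of this change]  The dispatch in if_start()
 * below, reduced to its decision: call the driver directly when it is
 * safe, otherwise bounce through the Giant-holding taskqueue.
 */
#if 0	/* illustrative only, never compiled */
        if (ifp->if_flags & IFF_NEEDSGIANT) {
                if (mtx_owned(&Giant))
                        (*ifp->if_start)(ifp); /* Giant already held */
                else
                        taskqueue_enqueue(taskqueue_swi_giant,
                            &ifp->if_starttask);
        } else
                (*ifp->if_start)(ifp);         /* MPSAFE driver */
#endif
/*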
* * XXXRW: We need to make sure that the ifnet isn't fully detached until any * outstanding if_start_deferred() tasks have completed; otherwise they will * run after the free. This probably means waiting in if_detach(). */ void if_start(struct ifnet *ifp) { if (ifp->if_flags & IFF_NEEDSGIANT) { if (mtx_owned(&Giant)) (*(ifp)->if_start)(ifp); else taskqueue_enqueue(taskqueue_swi_giant, &ifp->if_starttask); } else (*(ifp)->if_start)(ifp); } static void if_start_deferred(void *context, int pending) { struct ifnet *ifp; GIANT_REQUIRED; ifp = context; (ifp->if_start)(ifp); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { _IF_DROP(ifq); IF_UNLOCK(ifq); m_freem(m); return (0); } if (ifp != NULL) { ifp->if_obytes += m->m_pkthdr.len + adjust; if (m->m_flags & (M_BCAST|M_MCAST)) ifp->if_omcasts++; active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) if_start(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } Index: head/sys/net/if_ethersubr.c =================================================================== --- head/sys/net/if_ethersubr.c (revision 185087) +++ head/sys/net/if_ethersubr.c (revision 185088) @@ -1,1291 +1,1293 @@ /*- * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include "opt_mac.h" #include "opt_netgraph.h" #include "opt_carp.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #ifdef DEV_CARP #include #endif #ifdef IPX #include #include #endif int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, short *tp, int *hlen); #ifdef NETATALK #include #include #include #define llc_snap_org_code llc_un.type_snap.org_code #define llc_snap_ether_type llc_un.type_snap.ether_type extern u_char at_org_code[3]; extern u_char aarp_org_code[3]; #endif /* NETATALK */ #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); int (*bridge_output_p)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); /* XXX: should be in an arp support file, not here */ MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals"); #define ETHER_IS_BROADCAST(addr) \ (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #define senderr(e) do { error = (e); goto bad;} while (0) #if defined(INET) || defined(INET6) int ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared); +#ifdef VIMAGE_GLOBALS static int ether_ipfw; +#endif #endif /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt0) { short type; int error, hdrcmplt = 0; u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN]; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); hlen = ETHER_HDR_LEN; switch (dst->sa_family) { #ifdef INET case AF_INET: error = arpresolve(ifp, rt0, m, dst, edst); if (error) return (error == EWOULDBLOCK ? 
0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); loop_copy = 0; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN); else bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: error = nd6_storelladdr(ifp, rt0, m, dst, (u_char *)edst); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif #ifdef IPX case AF_IPX: if (ef_outputp) { error = ef_outputp(ifp, &m, dst, &type, &hlen); if (error) goto bad; } else type = htons(ETHERTYPE_IPX); bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), (caddr_t)edst, sizeof (edst)); break; #endif #ifdef NETATALK case AF_APPLETALK: { struct at_ifaddr *aa; if ((aa = at_ifawithnet((struct sockaddr_at *)dst)) == NULL) senderr(EHOSTUNREACH); /* XXX */ if (!aarpresolve(ifp, m, (struct sockaddr_at *)dst, edst)) return (0); /* * In the phase 2 case, need to prepend an mbuf for the llc header. */ if ( aa->aa_flags & AFA_PHASE2 ) { struct llc llc; M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); if (m == NULL) senderr(ENOBUFS); llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; llc.llc_control = LLC_UI; bcopy(at_org_code, llc.llc_snap_org_code, sizeof(at_org_code)); llc.llc_snap_ether_type = htons( ETHERTYPE_AT ); bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN); type = htons(m->m_pkthdr.len); hlen = LLC_SNAPFRAMELEN + ETHER_HDR_LEN; } else { type = htons(ETHERTYPE_AT); } break; } #endif /* NETATALK */ case pseudo_AF_HDRCMPLT: hdrcmplt = 1; eh = (struct ether_header *)dst->sa_data; (void)memcpy(esrc, eh->ether_shost, sizeof (esrc)); /* FALLTHROUGH */ case AF_UNSPEC: loop_copy = 0; /* if this is for us, don't do it */ eh = (struct ether_header *)dst->sa_data; (void)memcpy(edst, eh->ether_dhost, sizeof (edst)); type = eh->ether_type; break; default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); if (m == NULL) senderr(ENOBUFS); eh = mtod(m, struct ether_header *); (void)memcpy(&eh->ether_type, &type, sizeof(eh->ether_type)); (void)memcpy(eh->ether_dhost, edst, sizeof (edst)); if (hdrcmplt) (void)memcpy(eh->ether_shost, esrc, sizeof(eh->ether_shost)); else (void)memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { int csum_flags = 0; if (m->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (m->m_flags & M_BCAST) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. 
The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_DONTWAIT)) != NULL) { n->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) n->m_pkthdr.csum_data = 0xffff; (void)if_simloop(ifp, n, dst->sa_family, hlen); } else ifp->if_iqdrops++; } else if (bcmp(eh->ether_dhost, eh->ether_shost, ETHER_ADDR_LEN) == 0) { m->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_data = 0xffff; (void) if_simloop(ifp, m, dst->sa_family, hlen); return (0); /* XXX */ } } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #ifdef DEV_CARP if (ifp->if_carp && (error = carp_output(ifp, m, dst, NULL))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { int error; #if defined(INET) || defined(INET6) INIT_VNET_NET(ifp->if_vnet); struct ip_fw *rule = ip_dn_claim_rule(m); if (IPFW_LOADED && V_ether_ipfw != 0) { if (ether_ipfw_chk(&m, ifp, &rule, 0) == 0) { if (m) { m_freem(m); return EACCES; /* pkt dropped */ } else return 0; /* consumed e.g. in a pipe */ } } #endif /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ IFQ_HANDOFF(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) /* * ipfw processing for ethernet packets (in and out). * The second parameter is NULL from ether_demux, and ifp from * ether_output_frame. */ int ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared) { INIT_VNET_IPFW(dst->if_vnet); struct ether_header *eh; struct ether_header save_eh; struct mbuf *m; int i; struct ip_fw_args args; if (*rule != NULL && V_fw_one_pass) return 1; /* dummynet packet, already partially processed */ /* * I need some amt of data to be contiguous, and in case others need * the packet (shared==1) also better be in the first mbuf. */ m = *m0; i = min( m->m_pkthdr.len, max_protohdr); if ( shared || m->m_len < i) { m = m_pullup(m, i); if (m == NULL) { *m0 = m; return 0; } } eh = mtod(m, struct ether_header *); save_eh = *eh; /* save copy for restore below */ m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ args.m = m; /* the packet we are looking at */ args.oif = dst; /* destination, if any */ args.rule = *rule; /* matching rule to restart */ args.next_hop = NULL; /* we do not support forward yet */ args.eh = &save_eh; /* MAC header for bridged/MAC packets */ args.inp = NULL; /* used by ipfw uid/gid/jail rules */ i = ip_fw_chk_ptr(&args); m = args.m; if (m != NULL) { /* * Restore Ethernet header, as needed, in case the * mbuf chain was replaced by ipfw. 
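 *
 * A sketch of the caller-side contract (mirroring ether_output_frame()
 * above, not a new interface): a return of 0 means the firewall has
 * disposed of the packet, and *m0 then distinguishes "denied" (non-NULL,
 * caller must free) from "consumed", e.g. by dummynet (NULL):
 *
 *	if (ether_ipfw_chk(&m, ifp, &rule, 0) == 0) {
 *		if (m != NULL)
 *			m_freem(m);
 *		return;
 *	}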
*/ M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); if (m == NULL) { *m0 = m; return 0; } if (eh != mtod(m, struct ether_header *)) bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } *m0 = m; *rule = args.rule; if (i == IP_FW_DENY) /* drop */ return 0; KASSERT(m != NULL, ("ether_ipfw_chk: m is NULL")); if (i == IP_FW_PASS) /* a PASS rule. */ return 1; if (DUMMYNET_LOADED && (i == IP_FW_DUMMYNET)) { /* * Pass the pkt to dummynet, which consumes it. * If shared, make a copy and keep the original. */ if (shared) { m = m_copypacket(m, M_DONTWAIT); if (m == NULL) return 0; } else { /* * Pass the original to dummynet and * nothing back to the caller */ *m0 = NULL ; } ip_dn_io_ptr(&m, dst ? DN_TO_ETH_OUT: DN_TO_ETH_DEMUX, &args); return 0; } /* * XXX at some point add support for divert/forward actions. * If none of the above matches, we have to drop the pkt. */ return 0; } #endif /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); ifp->if_ierrors++; m_freem(m); return; } if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); ifp->if_ierrors++; m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); ifp->if_ierrors++; m_freem(m); return; } #ifdef DIAGNOSTIC if (m->m_pkthdr.rcvif != ifp) { if_printf(ifp, "Warning, frame marked as received on %s\n", m->m_pkthdr.rcvif->if_xname); } #endif if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; ifp->if_imcasts++; } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } ifp->if_ibytes += m->m_pkthdr.len; /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else return; } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. 
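 *
 * For reference, the two frame layouts the block below distinguishes
 * (a sketch; field widths in bytes):
 *
 *	untagged: dhost[6] shost[6] type[2]                  payload
 *	802.1Q:   dhost[6] shost[6] 0x8100[2] tag[2] type[2] payload
 *
 * Decapsulation copies the twelve address bytes forward over the 4-byte
 * tag and then trims the front of the mbuf with m_adj().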
*/ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif ifp->if_ierrors++; m_freem(m); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } /* Allow ng_ether(4) to claim this frame. */ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) return; } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) return; } #ifdef DEV_CARP /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && carp_forus(ifp->if_carp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } /* First chunk of an mbuf contains good entropy */ if (harvest.ethernet) random_harvest(m, 16, 3, 0, RANDOM_NET); ether_demux(ifp, m); } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int isr; u_short ether_type; #if defined(NETATALK) struct llc *l; #endif KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); #if defined(INET) || defined(INET6) INIT_VNET_NET(ifp->if_vnet); /* * Allow dummynet and/or ipfw to claim the frame. * Do not do this for PROMISC frames in case we are re-entered. */ if (IPFW_LOADED && V_ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { struct ip_fw *rule = ip_dn_claim_rule(m); if (ether_ipfw_chk(&m, NULL, &rule, 0) == 0) { if (m) m_freem(m); /* dropped; free mbuf chain */ return; /* consumed */ } } #endif eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { ifp->if_noproto++; m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. 
*/ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m->m_flags &= ~(M_PROTOFLAGS); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: if ((m = ip_fastforward(m)) == NULL) return; isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef IPX case ETHERTYPE_IPX: if (ef_inputp && ef_inputp(ifp, eh, m) == 0) return; isr = NETISR_IPX; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif #ifdef NETATALK case ETHERTYPE_AT: isr = NETISR_ATALK1; break; case ETHERTYPE_AARP: isr = NETISR_AARP; break; #endif /* NETATALK */ default: #ifdef IPX if (ef_inputp && ef_inputp(ifp, eh, m) == 0) return; #endif /* IPX */ #if defined(NETATALK) if (ether_type > ETHERMTU) goto discard; l = mtod(m, struct llc *); if (l->llc_dsap == LLC_SNAP_LSAP && l->llc_ssap == LLC_SNAP_LSAP && l->llc_control == LLC_UI) { if (bcmp(&(l->llc_snap_org_code)[0], at_org_code, sizeof(at_org_code)) == 0 && ntohs(l->llc_snap_ether_type) == ETHERTYPE_AT) { m_adj(m, LLC_SNAPFRAMELEN); isr = NETISR_ATALK2; break; } if (bcmp(&(l->llc_snap_org_code)[0], aarp_org_code, sizeof(aarp_org_code)) == 0 && ntohs(l->llc_snap_ether_type) == ETHERTYPE_AARP) { m_adj(m, LLC_SNAPFRAMELEN); isr = NETISR_AARP; break; } } #endif /* NETATALK */ goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. 
*/ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if defined(INET) || defined(INET6) SYSCTL_V_INT(V_NET, vnet_net, _net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, ether_ipfw, 0, "Pass ether pkts through firewall"); #endif #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif #ifdef IPX /* * XXX - This code is probably wrong */ case AF_IPX: { struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr); if (ipx_nullhost(*ina)) ina->x_host = *(union ipx_host *) IF_LLADDR(ifp); else { bcopy((caddr_t) ina->x_host.c_host, (caddr_t) IF_LLADDR(ifp), ETHER_ADDR_LEN); } /* * Set new address */ ifp->if_init(ifp->if_softc); break; } #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ETHER_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address.
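 *
 * For the AF_INET case below, ETHER_MAP_IP_MULTICAST() derives the
 * link-layer group from the fixed IANA prefix plus the low 23 bits of
 * the IPv4 group address (a sketch, not the macro's literal text):
 *
 *	e_addr[0..2] = 01:00:5e
 *	e_addr[3..5] = low 23 bits of sin_addr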
*/ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = 0; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = malloc(sizeof *sdl, M_IFMADDR, M_NOWAIT|M_ZERO); if (sdl == NULL) return ENOMEM; sdl->sdl_len = sizeof *sdl; sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = 0; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = malloc(sizeof *sdl, M_IFMADDR, M_NOWAIT|M_ZERO); if (sdl == NULL) return (ENOMEM); sdl->sdl_len = sizeof *sdl; sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static void* ether_alloc(u_char type, struct ifnet *ifp) { struct arpcom *ac; ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO); ac->ac_ifp = ifp; return (ac); } static void ether_free(void *com, u_char type) { free(com, M_ARPCOM); } static int ether_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free); break; case MOD_UNLOAD: if_deregister_com_alloc(IFT_ETHER); break; default: return EOPNOTSUPP; } return (0); } static moduledata_t ether_mod = { "ether", ether_modevent, 0 }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). 
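 *
 * Pictorially, for the stack-allocated case (a sketch of the chain
 * built below):
 *
 *	mb (data, dlen) -> mv (&vlan, sizeof(vlan)) -> m (packet data)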
*/ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); Index: head/sys/net/if_gif.c =================================================================== --- head/sys/net/if_gif.c (revision 185087) +++ head/sys/net/if_gif.c (revision 185088) @@ -1,995 +1,1007 @@ /* $FreeBSD$ */ /* $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #endif /* INET */ #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #include #include #endif /* INET6 */ #include #include #include #include #include #define GIFNAME "gif" /* * gif_mtx protects the global gif_softc_list. 
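 *
 * Typical access pattern (illustrative; gif_set_tunnel() below does
 * exactly this):
 *
 *	struct gif_softc *sc;
 *
 *	mtx_lock(&gif_mtx);
 *	LIST_FOREACH(sc, &V_gif_softc_list, gif_list)
 *		...examine sc...
 *	mtx_unlock(&gif_mtx);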
*/ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); + +#ifdef VIMAGE_GLOBALS static LIST_HEAD(, gif_softc) gif_softc_list; +static int max_gif_nesting; +static int parallel_tunnels; +#ifdef INET +int ip_gif_ttl; +#endif +#ifdef INET6 +int ip6_gif_hlim; +#endif +#endif void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); static void gif_start(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); IFC_SIMPLE_DECLARE(gif, 0); static int gifmodevent(module_t, int, void *); SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0, "Generic Tunnel Interface"); #ifndef MAX_GIF_NEST /* * This macro controls the default upper limit on nesting of gif tunnels. * Since setting a large value here with a careless configuration may * crash the system, we don't allow any nesting by default. If you need * to configure nested gif tunnels, you can define this macro in your * kernel configuration file. However, if you do so, please be careful * to configure the tunnels so that they don't form a loop. */ #define MAX_GIF_NEST 1 #endif -#ifndef VIMAGE -static int max_gif_nesting = MAX_GIF_NEST; -#endif SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW, max_gif_nesting, 0, "Max nested tunnels"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_V_INT(V_NET, vnet_gif, _net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_RW, ip6_gif_hlim, 0, ""); #endif /* * By default, we disallow creation of multiple tunnels between the same * pair of addresses. Some applications require this functionality, so * we allow control over this check here.
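 *
 * (For example: with parallel_tunnels left at 0, configuring a second
 * gif interface with the same outer source/destination pair as an
 * existing one makes gif_set_tunnel() fail with EADDRNOTAVAIL.)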
*/ -#ifdef XBONEHACK -static int parallel_tunnels = 1; -#else -static int parallel_tunnels = 0; -#endif SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, parallel_tunnels, 0, "Allow parallel tunnels?"); /* copy from src/sys/net/if_ethersubr.c */ static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; #ifndef ETHER_IS_BROADCAST #define ETHER_IS_BROADCAST(addr) \ (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #endif static int gif_clone_create(ifc, unit, params) struct if_clone *ifc; int unit; caddr_t params; { INIT_VNET_GIF(curvnet); struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); sc->gif_fibnum = curthread->td_proc->p_fibnum; GIF2IFP(sc) = if_alloc(IFT_GIF); if (GIF2IFP(sc) == NULL) { free(sc, M_GIF); return (ENOSPC); } GIF_LOCK_INIT(sc); GIF2IFP(sc)->if_softc = sc; if_initname(GIF2IFP(sc), ifc->ifc_name, unit); sc->encap_cookie4 = sc->encap_cookie6 = NULL; GIF2IFP(sc)->if_addrlen = 0; GIF2IFP(sc)->if_mtu = GIF_MTU; GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; #if 0 /* turn off ingress filter */ GIF2IFP(sc)->if_flags |= IFF_LINK2; #endif GIF2IFP(sc)->if_ioctl = gif_ioctl; GIF2IFP(sc)->if_start = gif_start; GIF2IFP(sc)->if_output = gif_output; GIF2IFP(sc)->if_snd.ifq_maxlen = IFQ_MAXLEN; if_attach(GIF2IFP(sc)); bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); if (ng_gif_attach_p != NULL) (*ng_gif_attach_p)(GIF2IFP(sc)); mtx_lock(&gif_mtx); LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list); mtx_unlock(&gif_mtx); return (0); } static void gif_clone_destroy(ifp) struct ifnet *ifp; { #if defined(INET) || defined(INET6) int err; #endif struct gif_softc *sc = ifp->if_softc; mtx_lock(&gif_mtx); LIST_REMOVE(sc, gif_list); mtx_unlock(&gif_mtx); gif_delete_tunnel(ifp); #ifdef INET6 if (sc->encap_cookie6 != NULL) { err = encap_detach(sc->encap_cookie6); KASSERT(err == 0, ("Unexpected error detaching encap_cookie6")); } #endif #ifdef INET if (sc->encap_cookie4 != NULL) { err = encap_detach(sc->encap_cookie4); KASSERT(err == 0, ("Unexpected error detaching encap_cookie4")); } #endif if (ng_gif_detach_p != NULL) (*ng_gif_detach_p)(ifp); bpfdetach(ifp); if_detach(ifp); if_free(ifp); GIF_LOCK_DESTROY(sc); free(sc, M_GIF); } static int gifmodevent(mod, type, data) module_t mod; int type; void *data; { switch (type) { case MOD_LOAD: mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); - LIST_INIT(&V_gif_softc_list); - if_clone_attach(&gif_cloner); + LIST_INIT(&V_gif_softc_list); + V_max_gif_nesting = MAX_GIF_NEST; +#ifdef XBONEHACK + V_parallel_tunnels = 1; +#else + V_parallel_tunnels = 0; +#endif +#ifdef INET + V_ip_gif_ttl = GIF_TTL; +#endif #ifdef INET6 V_ip6_gif_hlim = GIF_HLIM; #endif + if_clone_attach(&gif_cloner); break; case MOD_UNLOAD: if_clone_detach(&gif_cloner); mtx_destroy(&gif_mtx); #ifdef INET6 V_ip6_gif_hlim = 0; #endif break; default: return EOPNOTSUPP; } return 0; } static moduledata_t gif_mod = { "if_gif", gifmodevent, 0 }; DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); int gif_encapcheck(m, off, proto, arg) const struct mbuf *m; int off; int proto; void *arg; { struct ip ip; struct gif_softc *sc; sc = (struct gif_softc *)arg; if (sc == NULL) return 0; if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* no physical address */ if (!sc->gif_psrc || !sc->gif_pdst) return 0; switch (proto) { #ifdef INET case IPPROTO_IPV4: break; #endif #ifdef INET6 case IPPROTO_IPV6: break; #endif case IPPROTO_ETHERIP: break; default: 
return 0; } /* Bail on short packets */ if (m->m_pkthdr.len < sizeof(ip)) return 0; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); switch (ip.ip_v) { #ifdef INET case 4: if (sc->gif_psrc->sa_family != AF_INET || sc->gif_pdst->sa_family != AF_INET) return 0; return gif_encapcheck4(m, off, proto, arg); #endif #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) return 0; if (sc->gif_psrc->sa_family != AF_INET6 || sc->gif_pdst->sa_family != AF_INET6) return 0; return gif_encapcheck6(m, off, proto, arg); #endif default: return 0; } } static void gif_start(struct ifnet *ifp) { struct gif_softc *sc; struct mbuf *m; sc = ifp->if_softc; ifp->if_drv_flags |= IFF_DRV_OACTIVE; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == 0) break; gif_output(ifp, m, sc->gif_pdst, NULL); } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; return; } int gif_output(ifp, m, dst, rt) struct ifnet *ifp; struct mbuf *m; struct sockaddr *dst; struct rtentry *rt; /* added in net2 */ { INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct m_tag *mtag; int error = 0; int gif_called; u_int32_t af; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto end; } #endif /* * gif may cause infinite recursion calls when misconfigured. * We'll prevent this by detecting loops. * * High nesting level may cause stack exhaustion. * We'll prevent this by introducing upper limit. */ gif_called = 1; mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, NULL); while (mtag != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "gif_output: loop detected on %s\n", (*(struct ifnet **)(mtag + 1))->if_xname); m_freem(m); error = EIO; /* is there better errno? */ goto end; } mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag); gif_called++; } if (gif_called > V_max_gif_nesting) { log(LOG_NOTICE, "gif_output: recursively called too many times(%d)\n", gif_called); m_freem(m); error = EIO; /* is there better errno? */ goto end; } mtag = m_tag_alloc(MTAG_GIF, MTAG_GIF_CALLED, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) { m_freem(m); error = ENOMEM; goto end; } *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); m->m_flags &= ~(M_BCAST|M_MCAST); GIF_LOCK(sc); if (!(ifp->if_flags & IFF_UP) || sc->gif_psrc == NULL || sc->gif_pdst == NULL) { GIF_UNLOCK(sc); m_freem(m); error = ENETDOWN; goto end; } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) { bcopy(dst->sa_data, &af, sizeof(af)); dst->sa_family = af; } af = dst->sa_family; BPF_MTAP2(ifp, &af, sizeof(af), m); ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; /* override to IPPROTO_ETHERIP for bridged traffic */ if (ifp->if_bridge) af = AF_LINK; M_SETFIB(m, sc->gif_fibnum); /* inner AF-specific encapsulation */ /* XXX should we check if our outer source is legal? 
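 *
 * (On the nesting check above: each pass through gif_output() prepends
 * one MTAG_GIF tag recording the ifnet, so the number of tags found
 * equals the current encapsulation depth, bounded by V_max_gif_nesting;
 * a tag that already names the current ifp indicates a loop.)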
*/ /* dispatch to output logic based on outer AF */ switch (sc->gif_psrc->sa_family) { #ifdef INET case AF_INET: error = in_gif_output(ifp, af, m); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_output(ifp, af, m); break; #endif default: m_freem(m); error = ENETDOWN; } GIF_UNLOCK(sc); end: if (error) ifp->if_oerrors++; return (error); } void gif_input(m, af, ifp) struct mbuf *m; int af; struct ifnet *ifp; { int isr, n; struct etherip_header *eip; struct ether_header *eh; struct ifnet *oldifp; if (ifp == NULL) { /* just in case */ m_freem(m); return; } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (bpf_peers_present(ifp->if_bpf)) { u_int32_t af1 = af; bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); } if (ng_gif_input_p != NULL) { (*ng_gif_input_p)(ifp, &m, af); if (m == NULL) return; } /* * Put the packet to the network layer input queue according to the * specified address family. * Note: older versions of gif_input directly called network layer * input functions, e.g. ip6_input, here. We changed the policy to * prevent too many recursive calls of such input functions, which * might cause kernel panic. But the change may introduce another * problem; if the input queue is full, packets are discarded. * The kernel stack overflow really happened, and we believed * queue-full rarely occurs, so we changed the policy. */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif case AF_LINK: n = sizeof(struct etherip_header) + sizeof(struct ether_header); if (n > m->m_len) { m = m_pullup(m, n); if (m == NULL) { ifp->if_ierrors++; return; } } eip = mtod(m, struct etherip_header *); if (eip->eip_ver != (ETHERIP_VERSION & ETHERIP_VER_VERS_MASK)) { /* discard unknown versions */ m_freem(m); return; } m_adj(m, sizeof(struct etherip_header)); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = ifp; if (ifp->if_bridge) { oldifp = ifp; eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; ifp->if_imcasts++; } BRIDGE_INPUT(ifp, m); if (m != NULL && ifp != oldifp) { /* * The bridge gave us back itself or one of the * members for which the frame is addressed. */ ether_demux(ifp, m); return; } } if (m != NULL) m_freem(m); return; default: if (ng_gif_input_orphan_p != NULL) (*ng_gif_input_orphan_p)(ifp, m, af); else m_freem(m); return; } ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; netisr_dispatch(isr, m); } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? 
*/ int gif_ioctl(ifp, cmd, data) struct ifnet *ifp; u_long cmd; caddr_t data; { struct gif_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq*)data; int error = 0, size; struct sockaddr *dst, *src; #ifdef SIOCSIFMTU /* xxx */ u_long mtu; #endif switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; break; case SIOCSIFDSTADDR: break; case SIOCADDMULTI: case SIOCDELMULTI: break; #ifdef SIOCSIFMTU /* xxx */ case SIOCGIFMTU: break; case SIOCSIFMTU: mtu = ifr->ifr_mtu; if (mtu < GIF_MTU_MIN || mtu > GIF_MTU_MAX) return (EINVAL); ifp->if_mtu = mtu; break; #endif /* SIOCSIFMTU */ #ifdef INET case SIOCSIFPHYADDR: #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif /* INET6 */ case SIOCSLIFPHYADDR: switch (cmd) { #ifdef INET case SIOCSIFPHYADDR: src = (struct sockaddr *) &(((struct in_aliasreq *)data)->ifra_addr); dst = (struct sockaddr *) &(((struct in_aliasreq *)data)->ifra_dstaddr); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: src = (struct sockaddr *) &(((struct in6_aliasreq *)data)->ifra_addr); dst = (struct sockaddr *) &(((struct in6_aliasreq *)data)->ifra_dstaddr); break; #endif case SIOCSLIFPHYADDR: src = (struct sockaddr *) &(((struct if_laddrreq *)data)->addr); dst = (struct sockaddr *) &(((struct if_laddrreq *)data)->dstaddr); break; default: return EINVAL; } /* sa_family must be equal */ if (src->sa_family != dst->sa_family) return EINVAL; /* validate sa_len */ switch (src->sa_family) { #ifdef INET case AF_INET: if (src->sa_len != sizeof(struct sockaddr_in)) return EINVAL; break; #endif #ifdef INET6 case AF_INET6: if (src->sa_len != sizeof(struct sockaddr_in6)) return EINVAL; break; #endif default: return EAFNOSUPPORT; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (dst->sa_len != sizeof(struct sockaddr_in)) return EINVAL; break; #endif #ifdef INET6 case AF_INET6: if (dst->sa_len != sizeof(struct sockaddr_in6)) return EINVAL; break; #endif default: return EAFNOSUPPORT; } /* check sa_family looks sane for the cmd */ switch (cmd) { case SIOCSIFPHYADDR: if (src->sa_family == AF_INET) break; return EAFNOSUPPORT; #ifdef INET6 case SIOCSIFPHYADDR_IN6: if (src->sa_family == AF_INET6) break; return EAFNOSUPPORT; #endif /* INET6 */ case SIOCSLIFPHYADDR: /* checks done in the above */ break; } error = gif_set_tunnel(GIF2IFP(sc), src, dst); break; #ifdef SIOCDIFPHYADDR case SIOCDIFPHYADDR: gif_delete_tunnel(GIF2IFP(sc)); break; #endif case SIOCGIFPSRCADDR: #ifdef INET6 case SIOCGIFPSRCADDR_IN6: #endif /* INET6 */ if (sc->gif_psrc == NULL) { error = EADDRNOTAVAIL; goto bad; } src = sc->gif_psrc; switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: dst = &ifr->ifr_addr; size = sizeof(ifr->ifr_addr); break; #endif /* INET */ #ifdef INET6 case SIOCGIFPSRCADDR_IN6: dst = (struct sockaddr *) &(((struct in6_ifreq *)data)->ifr_addr); size = sizeof(((struct in6_ifreq *)data)->ifr_addr); break; #endif /* INET6 */ default: error = EADDRNOTAVAIL; goto bad; } if (src->sa_len > size) return EINVAL; bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); #ifdef INET6 if (dst->sa_family == AF_INET6) { error = sa6_recoverscope((struct sockaddr_in6 *)dst); if (error != 0) return (error); } #endif break; case SIOCGIFPDSTADDR: #ifdef INET6 case SIOCGIFPDSTADDR_IN6: #endif /* INET6 */ if (sc->gif_pdst == NULL) { error = EADDRNOTAVAIL; goto bad; } src = sc->gif_pdst; switch (cmd) { #ifdef INET case SIOCGIFPDSTADDR: dst = &ifr->ifr_addr; size = sizeof(ifr->ifr_addr); break; #endif /* INET */ #ifdef INET6 case SIOCGIFPDSTADDR_IN6: dst = (struct sockaddr *) &(((struct in6_ifreq *)data)->ifr_addr); size = 
sizeof(((struct in6_ifreq *)data)->ifr_addr); break; #endif /* INET6 */ default: error = EADDRNOTAVAIL; goto bad; } if (src->sa_len > size) return EINVAL; bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); #ifdef INET6 if (dst->sa_family == AF_INET6) { error = sa6_recoverscope((struct sockaddr_in6 *)dst); if (error != 0) return (error); } #endif break; case SIOCGLIFPHYADDR: if (sc->gif_psrc == NULL || sc->gif_pdst == NULL) { error = EADDRNOTAVAIL; goto bad; } /* copy src */ src = sc->gif_psrc; dst = (struct sockaddr *) &(((struct if_laddrreq *)data)->addr); size = sizeof(((struct if_laddrreq *)data)->addr); if (src->sa_len > size) return EINVAL; bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); /* copy dst */ src = sc->gif_pdst; dst = (struct sockaddr *) &(((struct if_laddrreq *)data)->dstaddr); size = sizeof(((struct if_laddrreq *)data)->dstaddr); if (src->sa_len > size) return EINVAL; bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); break; case SIOCSIFFLAGS: /* if_ioctl() takes care of it */ break; default: error = EINVAL; break; } bad: return error; } /* * XXXRW: There's a general event-ordering issue here: the code to check * if a given tunnel is already present happens before we perform a * potentially blocking setup of the tunnel. This code needs to be * re-ordered so that the check and replacement can be atomic using * a mutex. */ int gif_set_tunnel(ifp, src, dst) struct ifnet *ifp; struct sockaddr *src; struct sockaddr *dst; { INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct gif_softc *sc2; struct sockaddr *osrc, *odst, *sa; int error = 0; mtx_lock(&gif_mtx); LIST_FOREACH(sc2, &V_gif_softc_list, gif_list) { if (sc2 == sc) continue; if (!sc2->gif_pdst || !sc2->gif_psrc) continue; if (sc2->gif_pdst->sa_family != dst->sa_family || sc2->gif_pdst->sa_len != dst->sa_len || sc2->gif_psrc->sa_family != src->sa_family || sc2->gif_psrc->sa_len != src->sa_len) continue; /* * Disallow parallel tunnels unless instructed * otherwise. */ if (!V_parallel_tunnels && bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 && bcmp(sc2->gif_psrc, src, src->sa_len) == 0) { error = EADDRNOTAVAIL; mtx_unlock(&gif_mtx); goto bad; } /* XXX both end must be valid? (I mean, not 0.0.0.0) */ } mtx_unlock(&gif_mtx); /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) switch (sc->gif_psrc->sa_family) { #ifdef INET case AF_INET: (void)in_gif_detach(sc); break; #endif #ifdef INET6 case AF_INET6: (void)in6_gif_detach(sc); break; #endif } osrc = sc->gif_psrc; sa = (struct sockaddr *)malloc(src->sa_len, M_IFADDR, M_WAITOK); bcopy((caddr_t)src, (caddr_t)sa, src->sa_len); sc->gif_psrc = sa; odst = sc->gif_pdst; sa = (struct sockaddr *)malloc(dst->sa_len, M_IFADDR, M_WAITOK); bcopy((caddr_t)dst, (caddr_t)sa, dst->sa_len); sc->gif_pdst = sa; switch (sc->gif_psrc->sa_family) { #ifdef INET case AF_INET: error = in_gif_attach(sc); break; #endif #ifdef INET6 case AF_INET6: /* * Check validity of the scope zone ID of the addresses, and * convert it into the kernel internal form if necessary. 
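 *
 * (For example, a link-local source such as fe80::1 is ambiguous on its
 * own; sa6_embedscope() encodes the zone into the in-kernel form of the
 * address, and sa6_recoverscope() undoes this before a sockaddr is
 * returned to userland, as in the SIOCGIFPSRCADDR handling above.)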
*/ error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_psrc, 0); if (error != 0) break; error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_pdst, 0); if (error != 0) break; error = in6_gif_attach(sc); break; #endif } if (error) { /* rollback */ free((caddr_t)sc->gif_psrc, M_IFADDR); free((caddr_t)sc->gif_pdst, M_IFADDR); sc->gif_psrc = osrc; sc->gif_pdst = odst; goto bad; } if (osrc) free((caddr_t)osrc, M_IFADDR); if (odst) free((caddr_t)odst, M_IFADDR); bad: if (sc->gif_psrc && sc->gif_pdst) ifp->if_drv_flags |= IFF_DRV_RUNNING; else ifp->if_drv_flags &= ~IFF_DRV_RUNNING; return error; } void gif_delete_tunnel(ifp) struct ifnet *ifp; { struct gif_softc *sc = ifp->if_softc; if (sc->gif_psrc) { free((caddr_t)sc->gif_psrc, M_IFADDR); sc->gif_psrc = NULL; } if (sc->gif_pdst) { free((caddr_t)sc->gif_pdst, M_IFADDR); sc->gif_pdst = NULL; } /* it is safe to detach from both */ #ifdef INET (void)in_gif_detach(sc); #endif #ifdef INET6 (void)in6_gif_detach(sc); #endif ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } Index: head/sys/net/if_loop.c =================================================================== --- head/sys/net/if_loop.c (revision 185087) +++ head/sys/net/if_loop.c (revision 185088) @@ -1,371 +1,374 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. 
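 *
 * Capture note (illustrative): the interface attaches to bpf with
 * DLT_NULL, so each captured packet is prefixed with a 4-byte address
 * family word in host byte order, as in:
 *
 *	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 *	...
 *	bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);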
*/ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef IPX #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #ifdef NETATALK #include #include #endif #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif int loioctl(struct ifnet *, u_long, caddr_t); static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); -struct ifnet *loif = NULL; /* Used externally */ +#ifdef VIMAGE_GLOBALS +struct ifnet *loif; /* Used externally */ +#endif IFC_SIMPLE_DECLARE(lo, 1); static void lo_clone_destroy(struct ifnet *ifp) { /* XXX: destroying lo0 will lead to panics. */ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); bpfdetach(ifp); if_detach(ifp); if_free(ifp); } static int lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) { INIT_VNET_NET(curvnet); struct ifnet *ifp; ifp = if_alloc(IFT_LOOP); if (ifp == NULL) return (ENOSPC); if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); if (V_loif == NULL) V_loif = ifp; return (0); } static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: + V_loif = NULL; if_clone_attach(&lo_cloner); break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return (EINVAL); default: return (EOPNOTSUPP); } return (0); } static moduledata_t loop_mod = { "loop", loop_modevent, 0 }; DECLARE_MODULE(loop, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { u_int32_t af; M_ASSERTPKTHDR(m); /* check if we have the packet header */ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } ifp->if_opackets++; ifp->if_obytes += m->m_pkthdr.len; /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) { bcopy(dst->sa_data, &af, sizeof(af)); dst->sa_family = af; } #if 1 /* XXX */ switch (dst->sa_family) { case AF_INET: case AF_INET6: case AF_IPX: case AF_APPLETALK: break; default: printf("looutput: af=%d unexpected\n", dst->sa_family); m_freem(m); return (EAFNOSUPPORT); } #endif return (if_simloop(ifp, m, dst->sa_family, 0)); } /* * if_simloop() * * This function is to support software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. * * This function expects the packet to include the media header of length hlen. 
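 *
 * An illustrative call, mirroring the broadcast case in ether_output()
 * (where hlen is ETHER_HDR_LEN for plain Ethernet frames): loop back a
 * writable copy with its link-layer header still attached:
 *
 *	(void)if_simloop(ifp, n, dst->sa_family, hlen);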
*/ int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) { INIT_VNET_NET(ifp->if_vnet); int isr; M_ASSERTPKTHDR(m); m_tag_delete_nonpersistent(m); m->m_pkthdr.rcvif = ifp; /* * Let BPF see the incoming packet in the following manner: * - Emulated packet loopback for a simplex interface * (net/if_ethersubr.c) * -> passes it to ifp's BPF * - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c) * -> does not pass it to any BPF * - Normal packet loopback from myself to myself (net/if_loop.c) * -> passes it to lo0's BPF (even in case of IPv6, where ifp!=lo0) */ if (hlen > 0) { if (bpf_peers_present(ifp->if_bpf)) { bpf_mtap(ifp->if_bpf, m); } } else { if (bpf_peers_present(V_loif->if_bpf)) { if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. */ bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #ifndef __NO_STRICT_ALIGNMENT /* * Some archs do not like unaligned data, so * we move data down in the first mbuf. */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); m->m_data -= (mtod(m,vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; isr = NETISR_IPV6; break; #endif #ifdef IPX case AF_IPX: isr = NETISR_IPX; break; #endif #ifdef NETATALK case AF_APPLETALK: isr = NETISR_ATALK2; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; netisr_queue(isr, m); /* mbuf is freed on failure. */ return (0); } /* ARGSUSED */ static void lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { RT_LOCK_ASSERT(rt); rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; } /* * Process an ioctl request. */ /* ARGSUSED */ int loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifa = (struct ifaddr *)data; ifa->ifa_rtrequest = lortrequest; /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == 0) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: break; default: error = EINVAL; } return (error); } Index: head/sys/net/raw_cb.c =================================================================== --- head/sys/net/raw_cb.c (revision 185087) +++ head/sys/net/raw_cb.c (revision 185088) @@ -1,118 +1,120 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_cb.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Routines to manage the raw protocol control blocks. * * TODO: * hash lookups by protocol family/protocol + address family * take care of unique address problems per AF? * redo address binding to allow wildcards */ struct mtx rawcb_mtx; +#ifdef VIMAGE_GLOBALS struct rawcb_list_head rawcb_list; +#endif SYSCTL_NODE(_net, OID_AUTO, raw, CTLFLAG_RW, 0, "Raw socket infrastructure"); static u_long raw_sendspace = RAWSNDQ; SYSCTL_ULONG(_net_raw, OID_AUTO, sendspace, CTLFLAG_RW, &raw_sendspace, 0, "Default raw socket send space"); static u_long raw_recvspace = RAWRCVQ; SYSCTL_ULONG(_net_raw, OID_AUTO, recvspace, CTLFLAG_RW, &raw_recvspace, 0, "Default raw socket receive space"); /* * Allocate a control block and a nominal amount of buffer space for the * socket. */ int raw_attach(struct socket *so, int proto) { INIT_VNET_NET(so->so_vnet); struct rawcb *rp = sotorawcb(so); int error; /* * It is assumed that raw_attach is called after space has been * allocated for the rawcb; consumer protocols may simply allocate * type struct rawcb, or a wrapper data structure that begins with a * struct rawcb. */ KASSERT(rp != NULL, ("raw_attach: rp == NULL")); error = soreserve(so, raw_sendspace, raw_recvspace); if (error) return (error); rp->rcb_socket = so; rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; rp->rcb_proto.sp_protocol = proto; mtx_lock(&rawcb_mtx); LIST_INSERT_HEAD(&V_rawcb_list, rp, list); mtx_unlock(&rawcb_mtx); return (0); } /* * Detach the raw connection block and discard socket resources. */ void raw_detach(struct rawcb *rp) { struct socket *so = rp->rcb_socket; KASSERT(so->so_pcb == rp, ("raw_detach: so_pcb != rp")); so->so_pcb = NULL; mtx_lock(&rawcb_mtx); LIST_REMOVE(rp, list); mtx_unlock(&rawcb_mtx); free((caddr_t)(rp), M_PCB); } Index: head/sys/net/route.c =================================================================== --- head/sys/net/route.c (revision 185087) +++ head/sys/net/route.c (revision 185088) @@ -1,1789 +1,1791 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. 
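As the comment in raw_attach() above notes, consumer protocols allocate the PCB themselves with a struct rawcb placed first. A minimal, illustrative sketch of that embedding pattern (not part of this commit; "struct mypcb" and my_pru_attach() are hypothetical names):

struct mypcb {
	struct rawcb	mp_rawcb;	/* must come first */
	int		mp_private;	/* protocol-private state */
};

static int
my_pru_attach(struct socket *so, int proto, struct thread *td)
{
	struct mypcb *mp;
	int error;

	mp = malloc(sizeof(*mp), M_PCB, M_WAITOK | M_ZERO);
	so->so_pcb = (caddr_t)mp;	/* raw_attach() asserts this is set */
	error = raw_attach(so, proto);	/* soreserve() + list insertion */
	if (error) {
		so->so_pcb = NULL;
		free(mp, M_PCB);
	}
	return (error);
}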
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * which is the new name for an in-kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_route.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include u_int rt_numfibs = RT_NUMFIBS; SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); /* * Allow the boot code to request that LESS than RT_MAXFIBS be used. * We can't do more because storage is statically allocated for now. * (for compatibility reasons.. this will change). */ TUNABLE_INT("net.fibs", &rt_numfibs); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun approach to a problem that needs * a more fine-grained solution.. that will come. */ u_int rt_add_addr_allfibs = 1; SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, &rt_add_addr_allfibs, 0, ""); TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); +#ifdef VIMAGE_GLOBALS static struct rtstat rtstat; /* by default only the first 'row' of tables will be accessed. */ /* * XXXMRT When we fix netstat, and do this differently, * we can allocate this dynamically. As long as we are keeping * things backwards compatible we need to allocate this * statically.
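The VIMAGE_GLOBALS hunks in this diff all follow one pattern, sketched below (illustrative only; "somevar" and vnet_example() are hypothetical names). The global loses its static initializer and moves under #ifdef VIMAGE_GLOBALS, every reference goes through a V_ macro, and initialization happens at runtime so each virtual network stack can start from a clean instance; that is why loop_modevent() above now sets V_loif = NULL at MOD_LOAD time.

#ifdef VIMAGE_GLOBALS
int somevar;			/* initializer moved to an init function */
#define	V_somevar	somevar	/* plain global access */
#else
#define	V_somevar	(vnet_example(curvnet)->v_somevar) /* per-vnet */
#endif

static void
example_init(void)
{

	V_somevar = 5;		/* replaces "static int somevar = 5;" */
}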
*/ struct radix_node_head *rt_tables[RT_MAXFIBS][AF_MAX+1]; static int rttrash; /* routes not in table but not freed */ +#endif static void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); /* compare two sockaddr structures */ #define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static uma_zone_t rtzone; /* Routing table UMA zone. */ #if 0 /* default fib for tunnels to use */ u_int tunnel_fib = 0; SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, ""); #endif /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static void route_init(void) { int table; struct domain *dom; int fam; /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rn_init(); /* initialize all zeroes, all ones, mask table */ for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach) { for (table = 0; table < rt_numfibs; table++) { if ( (fam = dom->dom_family) == AF_INET || table == 0) { /* for now only AF_INET has > 1 table */ /* XXX MRT * rtattach will be also called * from vfs_export.c but the * offset will be 0 * (only for AF_INET and AF_INET6 * which don't need it anyhow) */ dom->dom_rtattach( (void **)&V_rt_tables[table][fam], dom->dom_rtoffset); } else { break; } } } } } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc(struct route *ro) { rtalloc_ign_fib(ro, 0UL, 0); } void rtalloc_fib(struct route *ro, u_int fibnum) { rtalloc_ign_fib(ro, 0UL, fibnum); } void rtalloc_ign(struct route *ro, u_long ignore) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. 
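A short, illustrative caller of the lookup entry points defined below (not part of this commit; lookup_mtu() is a hypothetical helper). The key point from the comment above is that the returned entry carries both a reference and the lock, and RTFREE_LOCKED() gives both back:

static u_long
lookup_mtu(struct sockaddr *dst, u_int fibnum)
{
	struct rtentry *rt;
	u_long mtu;

	rt = rtalloc1_fib(dst, 1, 0UL, fibnum);
	if (rt == NULL)
		return (0);		/* no route */
	mtu = rt->rt_rmx.rmx_mtu;	/* safe: rt is locked */
	RTFREE_LOCKED(rt);		/* drop the reference and the lock */
	return (mtu);
}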
*/ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, 0)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh; struct rtentry *rt; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; u_long nflags; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ fibnum = 0; rnh = V_rt_tables[fibnum][dst->sa_family]; newrt = NULL; /* * Look up the address in the table for that Address Family */ if (rnh == NULL) { V_rtstat.rts_unreach++; goto miss2; } RADIX_NODE_HEAD_LOCK(rnh); if ((rn = rnh->rnh_matchaddr(dst, rnh)) && (rn->rn_flags & RNF_ROOT) == 0) { /* * If we find it and it's not the root node, then * get a reference on the rtentry associated. */ newrt = rt = RNTORT(rn); nflags = rt->rt_flags & ~ignflags; if (report && (nflags & RTF_CLONING)) { /* * We are apparently adding (report = 0 in delete). * If it requires that it be cloned, do so. * (This implies it wasn't a HOST route.) */ err = rtrequest_fib(RTM_RESOLVE, dst, NULL, NULL, 0, &newrt, fibnum); if (err) { /* * If the cloning didn't succeed, maybe * what we have will do. Return that. */ newrt = rt; /* existing route */ RT_LOCK(newrt); RT_ADDREF(newrt); goto miss; } KASSERT(newrt, ("no route and no error")); RT_LOCK(newrt); if (newrt->rt_flags & RTF_XRESOLVE) { /* * If the new route specifies it be * externally resolved, then go do that. */ msgtype = RTM_RESOLVE; goto miss; } /* Inform listeners of the new route. */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(newrt); info.rti_info[RTAX_NETMASK] = rt_mask(newrt); info.rti_info[RTAX_GATEWAY] = newrt->rt_gateway; if (newrt->rt_ifp != NULL) { info.rti_info[RTAX_IFP] = newrt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = newrt->rt_ifa->ifa_addr; } rt_missmsg(RTM_ADD, &info, newrt->rt_flags, 0); } else { RT_LOCK(newrt); RT_ADDREF(newrt); } RADIX_NODE_HEAD_UNLOCK(rnh); } else { /* * Either we hit the root or couldn't find any match, * Which basically means * "caint get there frm here" */ V_rtstat.rts_unreach++; miss: RADIX_NODE_HEAD_UNLOCK(rnh); miss2: if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg(msgtype, &info, 0, err); } } if (newrt) RT_LOCK_ASSERT(newrt); return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family]; KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { printf("%s: %p has %lu refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. 
* When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, rnh); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold.. * e.g. other routes and ifaddrs. */ if (rt->rt_ifa) IFAFREE(rt->rt_ifa); rt->rt_parent = NULL; /* NB: no refcnt on parent */ /* * The key is separately alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ Free(rt_key(rt)); /* * and the rtentry itself of course */ RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src) { rtredirect_fib(dst, gateway, netmask, flags, src, 0); } void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { INIT_VNET_NET(curvnet); struct rtentry *rt, *rt0 = NULL; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt && (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) error = EINVAL; else if (ifa_ifwithaddr(gateway)) error = EHOSTUNREACH; if (error) goto done; /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net.
*/ create: rt0 = rt; rt = NULL; flags |= RTF_GATEWAY | RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); flags = rt->rt_flags; } if (rt0) RTFREE_LOCKED(rt0); stat = &V_rtstat.rts_dynamic; } else { struct rtentry *gwrt; /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ rt_setgate(rt, rt_key(rt), gateway); gwrt = rtalloc1(gateway, 1, 0); EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); RTFREE_LOCKED(gwrt); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg(RTM_REDIRECT, &info, flags, error); } int rtioctl(u_long req, caddr_t data) { return (rtioctl_fib(req, data, 0)); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) { return (ifa_ifwithroute_fib(flags, dst, gateway, 0)); } struct ifaddr * ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { register struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. 
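An illustrative sketch of how a protocol would feed rtredirect_fib() above, roughly what the ICMP input path does for a host redirect (not part of this commit; handle_redirect() is a hypothetical wrapper):

static void
handle_redirect(struct in_addr dst4, struct in_addr gw4, struct in_addr src4,
    u_int fibnum)
{
	struct sockaddr_in dst, gw, src;

	bzero(&dst, sizeof(dst));
	dst.sin_len = sizeof(dst);
	dst.sin_family = AF_INET;
	dst.sin_addr = dst4;
	gw = src = dst;
	gw.sin_addr = gw4;
	src.sin_addr = src4;

	/* RTF_HOST: redirect a single destination, not a whole net. */
	rtredirect_fib((struct sockaddr *)&dst, (struct sockaddr *)&gw,
	    NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&src, fibnum);
}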
*/ ifa = ifa_ifwithdstaddr(gateway); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway); if (ifa == NULL) { struct rtentry *rt = rtalloc1_fib(gateway, 0, 0UL, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found) return (NULL); if ((ifa = rt->rt_ifa) == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } return (ifa); } static walktree_f_t rt_fixdelete; static walktree_f_t rt_fixchange; struct rtfc_arg { struct rtentry *rt0; struct radix_node_head *rnh; }; /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0)); } int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags int rt_getifa(struct rt_addrinfo *info) { return (rt_getifa_fib(info, 0)); } int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr)) != NULL) info->rti_ifp = ifa->ifa_ifp; if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } /* * Expunges references to a route that's about to be reclaimed. * The route must be locked. */ int rtexpunge(struct rtentry *rt) { INIT_VNET_NET(curvnet); struct radix_node *rn; struct radix_node_head *rnh; struct ifaddr *ifa; int error = 0; RT_LOCK_ASSERT(rt); #if 0 /* * We cannot assume anything about the reference count * because protocols call us in many situations; often * before unwinding references to the table entry. 
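For comparison with rtrequest_fib() above, a minimal illustrative caller of the rt_addrinfo interface directly (not part of this commit; add_host_route() is a hypothetical name). Both sockaddrs must carry valid sa_len/sa_family fields:

static int
add_host_route(struct sockaddr *dst, struct sockaddr *gw, u_int fibnum)
{
	struct rt_addrinfo info;

	bzero(&info, sizeof(info));
	info.rti_flags = RTF_HOST | RTF_GATEWAY | RTF_STATIC;
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_GATEWAY] = gw;
	/* RTAX_NETMASK stays NULL: host routes carry no mask. */
	return (rtrequest1_fib(RTM_ADD, &info, NULL, fibnum));
}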
*/ KASSERT(rt->rt_refcnt <= 1, ("bogus refcnt %ld", rt->rt_refcnt)); #endif /* * Find the correct routing tree to use for this Address Family */ rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); /* * Remove the item from the tree; it should be there, * but when callers invoke us blindly it may not (sigh). */ rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); if (rn == NULL) { error = ESRCH; goto bad; } KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, ("unexpected flags 0x%x", rn->rn_flags)); KASSERT(rt == RNTORT(rn), ("lookup mismatch, rt %p rn %p", rt, rn)); rt->rt_flags &= ~RTF_UP; /* * Now search what's left of the subtree for any cloned * routes which might have been formed from this node. */ if ((rt->rt_flags & RTF_CLONING) && rt_mask(rt)) rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), rt_fixdelete, rt); /* * Remove any external references we may have. * This might result in another rtentry being freed if * we held its last reference. */ if (rt->rt_gwroute) { RTFREE(rt->rt_gwroute); rt->rt_gwroute = NULL; } /* * Give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { struct rt_addrinfo info; bzero((caddr_t)&info, sizeof(info)); info.rti_flags = rt->rt_flags; info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifa->ifa_rtrequest(RTM_DELETE, rt, &info); } /* * one more rtentry floating around that is not * linked to the routing table. */ V_rttrash++; bad: RADIX_NODE_HEAD_UNLOCK(rnh); return (error); } int rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt) { return (rtrequest1_fib(req, info, ret_nrt, 0)); } int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { INIT_VNET_NET(curvnet); int error = 0; register struct rtentry *rt; register struct radix_node *rn; register struct radix_node_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; #define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ fibnum = 0; /* * Find the correct routing tree to use for this Address Family */ rnh = V_rt_tables[fibnum][dst->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) { netmask = NULL; flags &= ~RTF_CLONING; } switch (req) { case RTM_DELETE: #ifdef RADIX_MPATH /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ if (rn_mpath_capable(rnh)) { struct rtentry *rto = NULL; rn = rnh->rnh_matchaddr(dst, rnh); if (rn == NULL) senderr(ESRCH); rto = rt = RNTORT(rn); rt = rt_mpath_matchgate(rt, gateway); if (!rt) senderr(ESRCH); /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain.
*/ if (gateway && (rt->rt_gateway->sa_len != gateway->sa_len || memcmp(rt->rt_gateway, gateway, gateway->sa_len))) senderr(ESRCH); } /* * use the normal delete code to remove * the first entry */ goto normal_rtdel; } /* * if the entry is 2nd and on up */ if (!rt_mpath_deldup(rto, rt)) panic ("rtrequest1: rt_mpath_deldup"); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; goto deldone; /* done with the RTM_DELETE command */ } normal_rtdel: #endif /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ rn = rnh->rnh_deladdr(dst, netmask, rnh); if (rn == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; /* * Now search what's left of the subtree for any cloned * routes which might have been formed from this node. */ if ((rt->rt_flags & RTF_CLONING) && rt_mask(rt)) { rnh->rnh_walktree_from(rnh, dst, rt_mask(rt), rt_fixdelete, rt); } /* * Remove any external references we may have. * This might result in another rtentry being freed if * we held its last reference. */ if (rt->rt_gwroute) { RTFREE(rt->rt_gwroute); rt->rt_gwroute = NULL; } /* * give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, info); #ifdef RADIX_MPATH deldone: #endif /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: if (ret_nrt == NULL || (rt = *ret_nrt) == NULL) senderr(EINVAL); ifa = rt->rt_ifa; /* XXX locking? */ flags = rt->rt_flags & ~(RTF_CLONING | RTF_STATIC); flags |= RTF_WASCLONED; gateway = rt->rt_gateway; if ((netmask = rt->rt_genmask) == NULL) flags |= RTF_HOST; goto makeroute; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) senderr(EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) senderr(EINVAL); if (info->rti_ifa == NULL && (error = rt_getifa_fib(info, fibnum))) senderr(error); ifa = info->rti_ifa; makeroute: rt = uma_zalloc(rtzone, M_NOWAIT | M_ZERO); if (rt == NULL) senderr(ENOBUFS); RT_LOCK_INIT(rt); rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it * also add the rt_gwroute if possible. */ RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); senderr(error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * Note that we now have a reference to the ifa. * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. 
*/ IFAREF(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rn_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { if (rt->rt_gwroute) RTFREE(rt->rt_gwroute); if (rt->rt_ifa) { IFAFREE(rt->rt_ifa); } Free(rt_key(rt)); RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); senderr(EEXIST); } #endif /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); if (rn == NULL) { struct rtentry *rt2; /* * Uh-oh, we already have one of these in the tree. * We do a special hack: if the route that's already * there was generated by the cloning mechanism * then we just blow it away and retry the insertion * of the new one. */ rt2 = rtalloc1_fib(dst, 0, 0, fibnum); if (rt2 && rt2->rt_parent) { rtexpunge(rt2); RT_UNLOCK(rt2); rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); } else if (rt2) { /* undo the extra ref we got */ RTFREE_LOCKED(rt2); } } /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { if (rt->rt_gwroute) RTFREE(rt->rt_gwroute); if (rt->rt_ifa) IFAFREE(rt->rt_ifa); Free(rt_key(rt)); RT_LOCK_DESTROY(rt); uma_zfree(rtzone, rt); senderr(EEXIST); } rt->rt_parent = NULL; /* * If we got here from RESOLVE, then we are cloning * so clone the rest, and note that we * are a clone (and increment the parent's references) */ if (req == RTM_RESOLVE) { KASSERT(ret_nrt && *ret_nrt, ("no route to clone from")); rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */ rt->rt_rmx.rmx_pksent = 0; /* reset packet counter */ if ((*ret_nrt)->rt_flags & RTF_CLONING) { /* * NB: We do not bump the refcnt on the parent * entry under the assumption that it will * remain so long as we do. This is * important when deleting the parent route * as this operation requires traversing * the tree to delete all clones and futzing * with refcnts requires us to double-lock * parent through this back reference. */ rt->rt_parent = *ret_nrt; } } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * We repeat the same procedure from rt_setgate() here because * it doesn't fire when we call it there because the node * hasn't been added to the tree yet. */ if (req == RTM_ADD && !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) { struct rtfc_arg arg; arg.rnh = rnh; arg.rt0 = rt; rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), rt_fixchange, &arg); } /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } RT_UNLOCK(rt); break; default: error = EOPNOTSUPP; } bad: RADIX_NODE_HEAD_UNLOCK(rnh); return (error); #undef senderr } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags /* * Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family'' * (i.e., the routes related to it by the operation of cloning). This * routine is iterated over all potential former-child-routes by way of * rnh->rnh_walktree_from() above, and those that actually are children of * the late parent (passed in as VP here) are themselves deleted. 
*/ static int rt_fixdelete(struct radix_node *rn, void *vp) { struct rtentry *rt = RNTORT(rn); struct rtentry *rt0 = vp; if (rt->rt_parent == rt0 && !(rt->rt_flags & (RTF_PINNED | RTF_CLONING))) { return rtrequest_fib(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum); } return 0; } /* * This routine is called from rt_setgate() to do the analogous thing for * adds and changes. There is the added complication in this case of a * middle insert; i.e., insertion of a new network route between an older * network route and (cloned) host routes. For this reason, a simple check * of rt->rt_parent is insufficient; each candidate route must be tested * against the (mask, value) of the new route (passed as before in vp) * to see if the new route matches it. * * XXX - it may be possible to do fixdelete() for changes and reserve this * routine just for adds. I'm not sure why I thought it was necessary to do * changes this way. */ static int rt_fixchange(struct radix_node *rn, void *vp) { struct rtentry *rt = RNTORT(rn); struct rtfc_arg *ap = vp; struct rtentry *rt0 = ap->rt0; struct radix_node_head *rnh = ap->rnh; u_char *xk1, *xm1, *xk2, *xmp; int i, len, mlen; /* make sure we have a parent, and route is not pinned or cloning */ if (!rt->rt_parent || (rt->rt_flags & (RTF_PINNED | RTF_CLONING))) return 0; if (rt->rt_parent == rt0) /* parent match */ goto delete_rt; /* * There probably is a function somewhere which does this... * if not, there should be. */ len = imin(rt_key(rt0)->sa_len, rt_key(rt)->sa_len); xk1 = (u_char *)rt_key(rt0); xm1 = (u_char *)rt_mask(rt0); xk2 = (u_char *)rt_key(rt); /* avoid applying a less specific route */ xmp = (u_char *)rt_mask(rt->rt_parent); mlen = rt_key(rt->rt_parent)->sa_len; if (mlen > rt_key(rt0)->sa_len) /* less specific route */ return 0; for (i = rnh->rnh_treetop->rn_offset; i < mlen; i++) if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i]) return 0; /* less specific route */ for (i = rnh->rnh_treetop->rn_offset; i < len; i++) if ((xk2[i] & xm1[i]) != xk1[i]) return 0; /* no match */ /* * OK, this node is a clone, and matches the node currently being * changed/added under the node's mask. So, get rid of it. */ delete_rt: return rtrequest_fib(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum); } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { INIT_VNET_NET(curvnet); /* XXX dst may be overwritten, can we move this to below */ struct radix_node_head *rnh = V_rt_tables[rt->rt_fibnum][dst->sa_family]; int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); again: RT_LOCK_ASSERT(rt); /* * A host route with the destination equal to the gateway * will interfere with keeping LLINFO in the routing * table, so disallow it. */ if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == (RTF_HOST|RTF_GATEWAY)) && dst->sa_len == gate->sa_len && bcmp(dst, gate, dst->sa_len) == 0) { /* * The route might already exist if this is an RTM_CHANGE * or a routing redirect, so try to delete it. */ if (rt_key(rt)) rtexpunge(rt); return EADDRNOTAVAIL; } /* * Cloning loop avoidance in case of bad configuration. */ if (rt->rt_flags & RTF_GATEWAY) { struct rtentry *gwrt; RT_UNLOCK(rt); /* XXX workaround LOR */ gwrt = rtalloc1_fib(gate, 1, 0, rt->rt_fibnum); if (gwrt == rt) { RT_REMREF(rt); return (EADDRINUSE); /* failure */ } /* * Try to reacquire the lock on rt, and if it fails, * clean state and restart from scratch. 
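The two loops in rt_fixchange() above reduce to a plain masked-prefix test; an equivalent standalone version for clarity (illustrative only; prefix_match() is a hypothetical name):

static int
prefix_match(const u_char *key, const u_char *net, const u_char *mask,
    int off, int len)
{
	int i;

	for (i = off; i < len; i++)
		if ((key[i] & mask[i]) != net[i])
			return (0);	/* key differs within the mask */
	return (1);			/* key lies inside (net, mask) */
}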
*/ if (!RT_TRYLOCK(rt)) { RTFREE_LOCKED(gwrt); RT_LOCK(rt); goto again; } /* * If there is already a gwroute, then drop it. If we * are asked to replace route with itself, then do * not leak its refcounter. */ if (rt->rt_gwroute != NULL) { if (rt->rt_gwroute == gwrt) { RT_REMREF(rt->rt_gwroute); } else RTFREE(rt->rt_gwroute); } if ((rt->rt_gwroute = gwrt) != NULL) RT_UNLOCK(rt->rt_gwroute); } /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); /* * This isn't going to do anything useful for host routes, so * don't bother. Also make sure we have a reasonable mask * (we don't yet have one during adds). */ if (!(rt->rt_flags & RTF_HOST) && rt_mask(rt) != 0) { struct rtfc_arg arg; arg.rnh = rnh; arg.rt0 = rt; RT_UNLOCK(rt); /* XXX workaround LOR */ RADIX_NODE_HEAD_LOCK(rnh); RT_LOCK(rt); rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), rt_fixchange, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } return 0; } static void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { register u_char *cp1 = (u_char *)src; register u_char *cp2 = (u_char *)dst; register u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { INIT_VNET_NET(curvnet); struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if ( dst->sa_family != AF_INET) fibnum = 0; if (fibnum == -1) { if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { startfib = endfib = curthread->td_proc->p_fibnum; } else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } if (dst->sa_len == 0) return(EINVAL); /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. 
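What rt_maskedcopy() above produces for a typical IPv4 case, as an illustrative sketch (not part of this commit; demo_maskedcopy() is hypothetical, and rt_maskedcopy() itself is static to this file). The first two bytes (sa_len, sa_family) are copied unmasked; everything past the netmask's length is zeroed:

static void
demo_maskedcopy(void)
{
	struct sockaddr_in dst, mask, out;

	bzero(&dst, sizeof(dst));
	dst.sin_len = sizeof(dst);
	dst.sin_family = AF_INET;
	dst.sin_addr.s_addr = htonl(0xc000024d);	/* 192.0.2.77 */
	mask = dst;
	mask.sin_addr.s_addr = htonl(0xffffff00);	/* /24 */

	rt_maskedcopy((struct sockaddr *)&dst, (struct sockaddr *)&out,
	    (struct sockaddr *)&mask);
	/* out.sin_addr is now 192.0.2.0: host bits cleared for lookup. */
}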
* If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. XXX For this version only AF_INET. * When that changes code should be refactored to protocol * independent parts and protocol dependent parts. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node_head *rnh; struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ if ((rnh = V_rt_tables[fibnum][dst->sa_family]) == NULL) /* this table doesn't exist but others might */ continue; RADIX_NODE_HEAD_LOCK(rnh); #ifdef RADIX_MPATH if (rn_mpath_capable(rnh)) { rn = rnh->rnh_matchaddr(dst, rnh); if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (!rt) error = ESRCH; } } else #endif rn = rnh->rnh_lookup(dst, netmask, rnh); error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa || !sa_equal((struct sockaddr *)rn->rn_key, dst)); RADIX_NODE_HEAD_UNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | ifa->ifa_flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.103.54.246/24 * e.g. ifconfig bge0 192.103.54.247/24 * the address set in the route is 192.103.54.246 * so we need to replace it with 192.103.54.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { IFAFREE(rt->rt_ifa); IFAREF(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif rt_newaddrmsg(cmd, ifa, error, rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* special one for inet internal use. may not use. */ int rtinit_fib(struct ifaddr *ifa, int cmd, int flags) { return (rtinit1(ifa, cmd, flags, -1)); } /* * Set up a routing table entry, normally * for an interface. 
*/ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = 0; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } if (dst->sa_family == AF_INET) fib = -1; return (rtinit1(ifa, cmd, flags, fib)); } /* * rt_check() is invoked on each layer 2 output path, prior to * encapsulating outbound packets. * * The function is mostly used to find a routing entry for the gateway, * which in some protocol families could also point to the link-level * address for the gateway itself (the side effect of revalidating the * route to the destination is rather pointless at this stage, we did it * already a moment before in the pr_output() routine to locate the ifp * and gateway to use). * * When we remove the layer-3 to layer-2 mapping tables from the * routing table, this function can be removed. * * === On input === * *dst is the address of the NEXT HOP (which coincides with the * final destination if directly reachable); * *lrt0 points to the cached route to the final destination; * *lrt is not meaningful; * (*lrt0 has no ref held on it by us so REMREF is not needed. * Refs only account for major structural references and not usages, * which is actually a bit of a problem.) * * === Operation === * If the route is marked down try to find a new route. If the route * to the gateway is gone, try to setup a new route. Otherwise, * if the route is marked for packets to be rejected, enforce that. * Note that rtalloc returns an rtentry with an extra REF that we may * need to lose. * * === On return === * *dst is unchanged; * *lrt0 points to the (possibly new) route to the final destination * *lrt points to the route to the next hop [LOCKED] * * Their values are meaningful ONLY if no error is returned. * * To follow this you have to remember that: * RT_REMREF reduces the reference count by 1 but doesn't check it for 0 (!) * RTFREE_LOCKED includes an RT_REMREF (or an rtfree if refs == 1) * and an RT_UNLOCK * RTFREE does an RT_LOCK and an RTFREE_LOCKED * The gwroute pointer counts as a reference on the rtentry to which it points. * so when we add it we use the ref that rtalloc gives us and when we lose it * we need to remove the reference. * RT_TEMP_UNLOCK does an RT_ADDREF before freeing the lock, and * RT_RELOCK locks it (it can't have gone away due to the ref) and * drops the ref, possibly freeing it and zeroing the pointer if * the ref goes to 0 (unlocking in the process). */ int rt_check(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst) { struct rtentry *rt; struct rtentry *rt0; u_int fibnum; KASSERT(*lrt0 != NULL, ("rt_check")); rt0 = *lrt0; rt = NULL; fibnum = rt0->rt_fibnum; /* NB: the locking here is tortuous... */ RT_LOCK(rt0); retry: if (rt0 && (rt0->rt_flags & RTF_UP) == 0) { /* Current rt0 is useless, try get a replacement. */ RT_UNLOCK(rt0); rt0 = NULL; } if (rt0 == NULL) { rt0 = rtalloc1_fib(dst, 1, 0UL, fibnum); if (rt0 == NULL) { return (EHOSTUNREACH); } RT_REMREF(rt0); /* don't need the reference. */ } if (rt0->rt_flags & RTF_GATEWAY) { if ((rt = rt0->rt_gwroute) != NULL) { RT_LOCK(rt); /* NB: gwroute */ if ((rt->rt_flags & RTF_UP) == 0) { /* gw route is dud. 
ignore/lose it */ RTFREE_LOCKED(rt); /* unref (&unlock) gwroute */ rt = rt0->rt_gwroute = NULL; } } if (rt == NULL) { /* NOT AN ELSE CLAUSE */ RT_TEMP_UNLOCK(rt0); /* MUST return to undo this */ rt = rtalloc1_fib(rt0->rt_gateway, 1, 0UL, fibnum); if ((rt == rt0) || (rt == NULL)) { /* the best we can do is not good enough */ if (rt) { RT_REMREF(rt); /* assumes ref > 0 */ RT_UNLOCK(rt); } RTFREE(rt0); /* lock, unref, (unlock) */ return (ENETUNREACH); } /* * Relock it and lose the added reference. * All sorts of things could have happened while we * had no lock on it, so check for them. */ RT_RELOCK(rt0); if (rt0 == NULL || ((rt0->rt_flags & RTF_UP) == 0)) /* Ru-roh.. what we had is no longer any good */ goto retry; /* * While we were away, someone replaced the gateway. * Since a reference count is involved we can't just * overwrite it. */ if (rt0->rt_gwroute) { if (rt0->rt_gwroute != rt) { RTFREE_LOCKED(rt); goto retry; } } else { rt0->rt_gwroute = rt; } } RT_LOCK_ASSERT(rt); RT_UNLOCK(rt0); } else { /* think of rt as having the lock from now on.. */ rt = rt0; } /* XXX why are we inspecting rmx_expire? */ if ((rt->rt_flags & RTF_REJECT) && (rt->rt_rmx.rmx_expire == 0 || time_uptime < rt->rt_rmx.rmx_expire)) { RT_UNLOCK(rt); return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); } *lrt = rt; *lrt0 = rt0; return (0); } /* This must be before ip6_init2(), which is now SI_ORDER_MIDDLE */ SYSINIT(route, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); Index: head/sys/netinet/if_ether.c =================================================================== --- head/sys/netinet/if_ether.c (revision 185087) +++ head/sys/netinet/if_ether.c (revision 185088) @@ -1,1084 +1,1091 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref.
count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #include #define SIN(s) ((struct sockaddr_in *)s) #define SDL(s) ((struct sockaddr_dl *)s) SYSCTL_DECL(_net_link_ether); SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); /* timer values */ -static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +#ifdef VIMAGE_GLOBALS +static int arpt_keep; /* once resolved, good for 20 more minutes */ +static int arp_maxtries; +static int useloopback; /* use loopback interface for local traffic */ +static int arp_proxyall; +#endif SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, &arpt_keep, 0, "ARP entry lifetime in seconds"); #define rt_expire rt_rmx.rmx_expire struct llinfo_arp { struct callout la_timer; struct rtentry *la_rt; struct mbuf *la_hold; /* last packet until resolved/timeout */ u_short la_preempt; /* countdown for pre-expiry arps */ u_short la_asked; /* # requests sent */ }; static struct ifqueue arpintrq; -static int arp_maxtries = 5; -static int useloopback = 1; /* use loopback interface for local traffic */ -static int arp_proxyall = 0; - SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, arp_maxtries, 0, "ARP resolution attempts before returning error"); SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, useloopback, 0, "Use the loopback interface for local traffic"); SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, arp_proxyall, 0, "Enable proxy ARP for all suitable requests"); static void arp_init(void); static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, u_char *); static void arpintr(struct mbuf *); static void arptimer(void *); static struct rtentry *arplookup(u_long, int, int, int); #ifdef INET static void in_arpinput(struct mbuf *); #endif /* * Timeout routine. */ static void arptimer(void *arg) { struct rtentry *rt = (struct rtentry *)arg; RT_LOCK_ASSERT(rt); /* * The lock is needed to close a theoretical race * between spontaneous expiry and intentional removal. * We still got an extra reference on rtentry, so can * safely pass pointers to its contents. */ RT_UNLOCK(rt); in_rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), 0, NULL, rt->rt_fibnum); } /* * Parallel to llc_rtrequest. */ static void arp_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct sockaddr *gate; struct llinfo_arp *la; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct in_ifaddr *ia; struct ifaddr *ifa; RT_LOCK_ASSERT(rt); if (rt->rt_flags & RTF_GATEWAY) return; gate = rt->rt_gateway; la = (struct llinfo_arp *)rt->rt_llinfo; switch (req) { case RTM_ADD: /* * XXX: If this is a manually added route to interface * such as older version of routed or gated might provide, * restore cloning bit. */ if ((rt->rt_flags & RTF_HOST) == 0 && rt_mask(rt) != NULL && SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) rt->rt_flags |= RTF_CLONING; if (rt->rt_flags & RTF_CLONING) { /* * Case 1: This route should come from a route to iface. 
*/ rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl); gate = rt->rt_gateway; SDL(gate)->sdl_type = rt->rt_ifp->if_type; SDL(gate)->sdl_index = rt->rt_ifp->if_index; rt->rt_expire = time_uptime; break; } /* Announce a new entry if requested. */ if (rt->rt_flags & RTF_ANNOUNCE) arprequest(rt->rt_ifp, &SIN(rt_key(rt))->sin_addr, &SIN(rt_key(rt))->sin_addr, (u_char *)LLADDR(SDL(gate))); /*FALLTHROUGH*/ case RTM_RESOLVE: if (gate->sa_family != AF_LINK || gate->sa_len < sizeof(null_sdl)) { log(LOG_DEBUG, "%s: bad gateway %s%s\n", __func__, inet_ntoa(SIN(rt_key(rt))->sin_addr), (gate->sa_family != AF_LINK) ? " (!AF_LINK)": ""); break; } SDL(gate)->sdl_type = rt->rt_ifp->if_type; SDL(gate)->sdl_index = rt->rt_ifp->if_index; if (la != 0) break; /* This happens on a route change */ /* * Case 2: This route may come from cloning, or a manual route * add with a LL address. */ R_Zalloc(la, struct llinfo_arp *, sizeof(*la)); rt->rt_llinfo = (caddr_t)la; if (la == 0) { log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } /* * We are storing a route entry outside of radix tree. So, * it can be found and accessed by other means than radix * lookup. The routing code assumes that any rtentry detached * from radix can be destroyed safely. To prevent this, we * add an additional reference. */ RT_ADDREF(rt); la->la_rt = rt; rt->rt_flags |= RTF_LLINFO; callout_init_mtx(&la->la_timer, &rt->rt_mtx, CALLOUT_RETURNUNLOCKED); #ifdef INET /* * This keeps the multicast addresses from showing up * in `arp -a' listings as unresolved. It's not actually * functional. Then the same for broadcast. */ if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr)) && rt->rt_ifp->if_type != IFT_ARCNET) { ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, LLADDR(SDL(gate))); SDL(gate)->sdl_alen = 6; rt->rt_expire = 0; } if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { memcpy(LLADDR(SDL(gate)), rt->rt_ifp->if_broadcastaddr, rt->rt_ifp->if_addrlen); SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen; rt->rt_expire = 0; } #endif TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifp == rt->rt_ifp && SIN(rt_key(rt))->sin_addr.s_addr == (IA_SIN(ia))->sin_addr.s_addr) break; } if (ia) { /* * This test used to be * if (loif.if_flags & IFF_UP) * It allowed local traffic to be forced * through the hardware by configuring the loopback down. * However, it causes problems during network configuration * for boards that can't receive packets they send. * It is now necessary to clear "useloopback" and remove * the route to force traffic out to the hardware. */ rt->rt_expire = 0; bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)), SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); if (V_useloopback) { rt->rt_ifp = V_loif; rt->rt_rmx.rmx_mtu = V_loif->if_mtu; } /* * make sure to set rt->rt_ifa to the interface * address we are using, otherwise we will have trouble * with source address selection. */ ifa = &ia->ia_ifa; if (ifa != rt->rt_ifa) { IFAFREE(rt->rt_ifa); IFAREF(ifa); rt->rt_ifa = ifa; } } break; case RTM_DELETE: if (la == NULL) /* XXX: at least CARP does this. */ break; callout_stop(&la->la_timer); rt->rt_llinfo = NULL; rt->rt_flags &= ~RTF_LLINFO; RT_REMREF(rt); if (la->la_hold) m_freem(la->la_hold); Free((caddr_t)la); } } /* * Broadcast an ARP request. 
Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ static void arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) return; m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) + 2*ifp->if_data.ifi_addrlen; m->m_pkthdr.len = m->m_len; MH_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln); bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln); bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; m->m_flags |= M_BCAST; (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0); return; } /* * Resolve an IP address into an ethernet address. * On input: * ifp is the interface we use * rt0 is the route to the final destination (possibly useless) * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is where we want the address. * * On success, desten is filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, struct sockaddr *dst, u_char *desten) { INIT_VNET_INET(ifp->if_vnet); struct llinfo_arp *la = NULL; struct rtentry *rt = NULL; struct sockaddr_dl *sdl; int error; int fibnum = -1; if (m) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } fibnum = M_GETFIB(m); } if (rt0 != NULL) { /* Look for a cached arp (ll) entry. */ if (m == NULL) fibnum = rt0->rt_fibnum; error = rt_check(&rt, &rt0, dst); if (error) { m_freem(m); return error; } la = (struct llinfo_arp *)rt->rt_llinfo; if (la == NULL) RT_UNLOCK(rt); } /* * If we had no mbuf and no route, then hope the caller * has a fib in mind because we are running out of ideas. * I think this should not happen in current code. * (kmacy would know). */ if (fibnum == -1) fibnum = curthread->td_proc->p_fibnum; /* last gasp */ if (la == NULL) { /* * We enter this block if rt0 was NULL, * or if rt found by rt_check() didn't have llinfo. * we should get a cloned route, which since it should * come from the local interface should have a ll entry. * It may be incomplete but that's ok. */ rt = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0, fibnum); if (rt == NULL) { log(LOG_DEBUG, "arpresolve: can't allocate route for %s\n", inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); return (EINVAL); /* XXX */ } la = (struct llinfo_arp *)rt->rt_llinfo; if (la == NULL) { RT_UNLOCK(rt); log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s\n", inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); return (EINVAL); /* XXX */ } } sdl = SDL(rt->rt_gateway); /* * Check the address family and length is valid, the address * is resolved; otherwise, try to resolve. 
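The caller-side contract spelled out above, as ether_output() applies it (illustrative sketch, not part of this commit; send_ipv4() is a hypothetical wrapper). EWOULDBLOCK means arpresolve() kept the mbuf on the llinfo entry and will transmit it when the reply arrives, so the caller treats it as success:

static int
send_ipv4(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
    struct sockaddr *dst)
{
	u_char edst[ETHER_ADDR_LEN];
	int error;

	error = arpresolve(ifp, rt0, m, dst, edst);
	if (error == EWOULDBLOCK)
		return (0);	/* mbuf queued; ARP request is in flight */
	if (error)
		return (error);	/* mbuf already freed by arpresolve() */
	/* edst now holds the link-layer destination; prepend the
	 * Ethernet header and hand m to the interface (omitted). */
	return (0);
}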
*/ if ((rt->rt_expire == 0 || rt->rt_expire > time_uptime) && sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { bcopy(LLADDR(sdl), desten, sdl->sdl_alen); /* * If entry has an expiry time and it is approaching, * send an ARP request. */ if ((rt->rt_expire != 0) && (time_uptime + la->la_preempt > rt->rt_expire)) { struct in_addr sin = SIN(rt->rt_ifa->ifa_addr)->sin_addr; la->la_preempt--; RT_UNLOCK(rt); arprequest(ifp, &sin, &SIN(dst)->sin_addr, IF_LLADDR(ifp)); return (0); } RT_UNLOCK(rt); return (0); } /* * If ARP is disabled or static on this interface, stop. * XXX * Probably should not allocate empty llinfo struct if we are * not going to be sending out an arp request. */ if (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) { RT_UNLOCK(rt); m_freem(m); return (EINVAL); } /* * There is an arptab entry, but no ethernet address * response yet. Replace the held mbuf with this * latest one. */ if (m) { if (la->la_hold) m_freem(la->la_hold); la->la_hold = m; } KASSERT(rt->rt_expire > 0, ("sending ARP request for static entry")); /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = (rt == rt0) ? EHOSTDOWN : EHOSTUNREACH; if (la->la_asked == 0 || rt->rt_expire != time_uptime) { struct in_addr sin = SIN(rt->rt_ifa->ifa_addr)->sin_addr; rt->rt_expire = time_uptime; callout_reset(&la->la_timer, hz, arptimer, rt); la->la_asked++; RT_UNLOCK(rt); arprequest(ifp, &sin, &SIN(dst)->sin_addr, IF_LLADDR(ifp)); } else RT_UNLOCK(rt); return (error); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); return; } ar = mtod(m, struct arphdr *); if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && ntohs(ar->ar_hrd) != ARPHRD_ARCNET && ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) { log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n", (unsigned char *)&ar->ar_hrd, ""); m_freem(m); return; } if (m->m_len < arphdr_len(ar)) { if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { log(LOG_ERR, "arp: runt packet\n"); m_freem(m); return; } ar = mtod(m, struct arphdr *); } switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. 
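* The handler below validates the sender, refreshes any matching ARP entry in each FIB, and answers requests for local (or proxied) addresses.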
*/ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); static void in_arpinput(struct mbuf *m) { struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llinfo_arp *la; struct rtentry *rt; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr_dl *sdl; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; struct mbuf *hold; u_int8_t *enaddr = NULL; int op, rif_len; int req_len; int bridged = 0, is_bridge = 0; u_int fibnum; u_int goodfib = 0; int firstpass = 1; #ifdef DEV_CARP int carp_match = 0; #endif struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; INIT_VNET_INET(ifp->if_vnet); if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); return; } ah = mtod(m, struct arphdr *); op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). * If the interface does not match, but the receiving interface * is part of carp, we call carp_iamatch to see if this is a * request for the virtual host ip. * XXX: This is really ugly! */ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge != NULL) || (ia->ia_ifp == ifp)) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) goto match; #ifdef DEV_CARP if (ifp->if_carp != NULL && carp_iamatch(ifp->if_carp, ia, &isaddr, &enaddr) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { carp_match = 1; goto match; } #endif } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge != NULL) || (ia->ia_ifp == ifp)) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) goto match; #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifp = ia->ia_ifp; goto match; } } } #undef BDG_MEMBER_MATCHES_ARP /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = ifatoia(ifa); goto match; } /* * If bridging, fall back to using any inet address.
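* (i.e. take the first entry on V_in_ifaddrhead; when not bridging there is nothing sensible to fall back to, so the packet is dropped instead).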
*/ if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) goto drop; match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); myaddr = ia->ia_addr.sin_addr; if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { log(LOG_ERR, "arp: link address is broadcast for IP address %s!\n", inet_ntoa(isaddr)); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { log(LOG_ERR, "arp: %*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; /* * We look for any FIB that has this address to find * the interface etc. * For sanity checks that are FIB independent we abort the loop. */ for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rt = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0, fibnum); if (rt == NULL) continue; sdl = SDL(rt->rt_gateway); /* Only call this once */ if (firstpass) { sin.sin_addr.s_addr = isaddr.s_addr; EVENTHANDLER_INVOKE(route_arp_update_event, rt, ar_sha(ah), (struct sockaddr *)&sin); } la = (struct llinfo_arp *)rt->rt_llinfo; if (la == NULL) { RT_UNLOCK(rt); continue; } if (firstpass) { /* The following is not an error when doing bridging. */ if (!bridged && rt->rt_ifp != ifp #ifdef DEV_CARP && (ifp->if_type != IFT_CARP || !carp_match) #endif ) { if (log_arp_wrong_iface) log(LOG_ERR, "arp: %s is on %s " "but got reply from %*D " "on %s\n", inet_ntoa(isaddr), rt->rt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); RT_UNLOCK(rt); break; } if (sdl->sdl_alen && bcmp(ar_sha(ah), LLADDR(sdl), sdl->sdl_alen)) { if (rt->rt_expire) { if (log_arp_movements) log(LOG_INFO, "arp: %s moved from %*D to %*D " "on %s\n", inet_ntoa(isaddr), ifp->if_addrlen, (u_char *)LLADDR(sdl), ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } else { RT_UNLOCK(rt); if (log_arp_permanent_modify) log(LOG_ERR, "arp: %*D attempts to " "modify permanent entry " "for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); break; } } /* * sanity check for the address length. * XXX this does not work for protocols * with variable address length. -is */ if (sdl->sdl_alen && sdl->sdl_alen != ah->ar_hln) { log(LOG_WARNING, "arp from %*D: new addr len %d, was %d", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, sdl->sdl_alen); } if (ifp->if_addrlen != ah->ar_hln) { log(LOG_WARNING, "arp from %*D: addr len: " "new %d, i/f %d (ignored)", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); RT_UNLOCK(rt); break; } firstpass = 0; goodfib = fibnum; } /* Copy in the information received. */ (void)memcpy(LLADDR(sdl), ar_sha(ah), sdl->sdl_alen = ah->ar_hln); /* * If we receive an arp from a token-ring station over * a token-ring nic then try to save the source routing info. * XXXMRT Only minimal Token Ring support for MRT. * Only do this on the first pass as it modifies the mbuf.
*/ if (ifp->if_type == IFT_ISO88025) { struct iso88025_header *th = NULL; struct iso88025_sockaddr_dl_data *trld; /* force the fib loop to end after this pass */ fibnum = rt_numfibs - 1; th = (struct iso88025_header *)m->m_pkthdr.header; trld = SDL_ISO88025(sdl); rif_len = TR_RCF_RIFLEN(th->rcf); if ((th->iso88025_shost[0] & TR_RII) && (rif_len > 2)) { trld->trld_rcf = th->rcf; trld->trld_rcf ^= htons(TR_RCF_DIR); memcpy(trld->trld_route, th->rd, rif_len - 2); trld->trld_rcf &= ~htons(TR_RCF_BCST_MASK); /* * Set up source routing information for * reply packet (XXX) */ m->m_data -= rif_len; m->m_len += rif_len; m->m_pkthdr.len += rif_len; } else { th->iso88025_shost[0] &= ~TR_RII; trld->trld_rcf = 0; } m->m_data -= 8; m->m_len += 8; m->m_pkthdr.len += 8; th->rcf = trld->trld_rcf; } if (rt->rt_expire) { rt->rt_expire = time_uptime + V_arpt_keep; callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, rt); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; hold = la->la_hold; la->la_hold = NULL; RT_UNLOCK(rt); if (hold != NULL) (*ifp->if_output)(ifp, hold, rt_key(rt), rt); } /* end of FIB loop */ reply: /* * Decide if we have to respond to something. */ if (op != ARPOP_REQUEST) goto drop; if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { /* It's not asking for our address. But it still may * be something we should answer. * * XXX MRT * We assume that link level info is independent of * the table used and so we use whichever we can and don't * have a better option. */ /* Have we been asked to proxy for the target? */ rt = arplookup(itaddr.s_addr, 0, SIN_PROXY, goodfib); if (rt == NULL) { /* Nope, only interested now if proxying everything. */ struct sockaddr_in sin; if (!V_arp_proxyall) goto drop; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof sin; sin.sin_addr = itaddr; /* XXX MRT use table 0 for arp reply */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (rt->rt_ifp == ifp) { rtfree(rt); goto drop; } (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); rtfree(rt); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ sin.sin_addr = isaddr; /* XXX MRT use table 0 for arp checks */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; if (rt->rt_ifp != ifp) { log(LOG_INFO, "arp_proxy: ignoring request" " from %s via %s, expecting %s\n", inet_ntoa(isaddr), ifp->if_xname, rt->rt_ifp->if_xname); rtfree(rt); goto drop; } rtfree(rt); #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa(itaddr)); #endif } else { /* * Return proxied ARP replies only on the interface * or bridge cluster where this network resides. * Otherwise we may conflict with the host we are * proxying for.
*/ if (rt->rt_ifp != ifp && (rt->rt_ifp->if_bridge != ifp->if_bridge || ifp->if_bridge == NULL)) { RT_UNLOCK(rt); goto drop; } sdl = SDL(rt->rt_gateway); (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), LLADDR(sdl), ah->ar_hln); RT_UNLOCK(rt); } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa(itaddr)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; sa.sa_family = AF_ARP; sa.sa_len = 2; (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0); return; drop: m_freem(m); } #endif /* * Lookup or enter a new address in arptab. */ static struct rtentry * arplookup(u_long addr, int create, int proxy, int fibnum) { struct rtentry *rt; struct sockaddr_inarp sin; const char *why = 0; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = addr; if (proxy) sin.sin_other = SIN_PROXY; rt = in_rtalloc1((struct sockaddr *)&sin, create, 0UL, fibnum); if (rt == 0) return (0); if (rt->rt_flags & RTF_GATEWAY) why = "host is not on local network"; else if ((rt->rt_flags & RTF_LLINFO) == 0) why = "could not allocate llinfo"; else if (rt->rt_gateway->sa_family != AF_LINK) why = "gateway route is not ours"; if (why) { #define ISDYNCLONE(_rt) \ (((_rt)->rt_flags & (RTF_STATIC | RTF_WASCLONED)) == RTF_WASCLONED) if (create) log(LOG_DEBUG, "arplookup %s failed: %s\n", inet_ntoa(sin.sin_addr), why); /* * If there are no references to this Layer 2 route, * and it is a cloned route, and not static, and * arplookup() is creating the route, then purge * it from the routing table as it is probably bogus. */ if (rt->rt_refcnt == 1 && ISDYNCLONE(rt)) rtexpunge(rt); RTFREE_LOCKED(rt); return (0); #undef ISDYNCLONE } else { RT_REMREF(rt); return (rt); } } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CLONING; } void arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr) { if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, enaddr); ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CLONING; } static void arp_init(void) { + INIT_VNET_INET(curvnet); + + V_arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ + V_arp_maxtries = 5; + V_useloopback = 1; /* use loopback interface for local traffic */ + V_arp_proxyall = 0; arpintrq.ifq_maxlen = 50; mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); netisr_register(NETISR_ARP, arpintr, &arpintrq, 0); } SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 185087) +++ head/sys/netinet/igmp.c (revision 185088) @@ -1,534 +1,538 @@ /*- * Copyright (c) 1988 Stephen Deering. 
* Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); static struct router_info *find_rti(struct ifnet *ifp); static void igmp_sendpkt(struct in_multi *, int, unsigned long); +#ifdef VIMAGE_GLOBALS static struct igmpstat igmpstat; +#endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, igmpstat, igmpstat, ""); /* * igmp_mtx protects all mutable global variables in igmp.c, as well as the * data fields in struct router_info. In general, a router_info structure * will be valid as long as the referencing struct in_multi is valid, so no * reference counting is used. We allow unlocked reads of router_info data * when accessed via an in_multi read-only. */ -static struct mtx igmp_mtx; +#ifdef VIMAGE_GLOBALS static SLIST_HEAD(, router_info) router_info_head; +#endif +static struct mtx igmp_mtx; static int igmp_timers_are_running; /* * XXXRW: can we define these such that these can be made const? In any * case, these shouldn't be changed after igmp_init() and therefore don't * need locking. 
*/ static u_long igmp_all_hosts_group; static u_long igmp_all_rtrs_group; static struct mbuf *router_alert; static struct route igmprt; #ifdef IGMP_DEBUG #define IGMP_PRINTF(x) printf(x) #else #define IGMP_PRINTF(x) #endif void igmp_init(void) { INIT_VNET_INET(curvnet); struct ipoption *ra; /* * To avoid byte-swapping the same value over and over again. */ igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); igmp_timers_are_running = 0; /* * Construct a Router Alert option to use in outgoing packets. */ MGET(router_alert, M_DONTWAIT, MT_DATA); ra = mtod(router_alert, struct ipoption *); ra->ipopt_dst.s_addr = 0; ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ ra->ipopt_list[1] = 0x04; /* 4 bytes long */ ra->ipopt_list[2] = 0x00; ra->ipopt_list[3] = 0x00; router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF); SLIST_INIT(&V_router_info_head); } static struct router_info * find_rti(struct ifnet *ifp) { INIT_VNET_INET(ifp->if_vnet); struct router_info *rti; mtx_assert(&igmp_mtx, MA_OWNED); IGMP_PRINTF("[igmp.c, _find_rti] --> entering \n"); SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_ifp == ifp) { IGMP_PRINTF( "[igmp.c, _find_rti] --> found old entry \n"); return (rti); } } rti = malloc(sizeof *rti, M_IGMP, M_NOWAIT); if (rti == NULL) { IGMP_PRINTF("[igmp.c, _find_rti] --> no memory for entry\n"); return (NULL); } rti->rti_ifp = ifp; rti->rti_type = IGMP_V2_ROUTER; rti->rti_time = 0; SLIST_INSERT_HEAD(&V_router_info_head, rti, rti_list); IGMP_PRINTF("[igmp.c, _find_rti] --> created an entry \n"); return (rti); } void igmp_input(register struct mbuf *m, int off) { register int iphlen = off; register struct igmp *igmp; register struct ip *ip; register int igmplen; register struct ifnet *ifp = m->m_pkthdr.rcvif; register int minlen; register struct in_multi *inm; register struct in_ifaddr *ia; struct in_multistep step; struct router_info *rti; int timer; /** timer value in the igmp query header **/ INIT_VNET_INET(ifp->if_vnet); ++V_igmpstat.igps_rcv_total; ip = mtod(m, struct ip *); igmplen = ip->ip_len; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { ++V_igmpstat.igps_rcv_tooshort; m_freem(m); return; } minlen = iphlen + IGMP_MINLEN; if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { ++V_igmpstat.igps_rcv_tooshort; return; } /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { ++V_igmpstat.igps_rcv_badsum; m_freem(m); return; } m->m_data -= iphlen; m->m_len += iphlen; ip = mtod(m, struct ip *); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; /* * In the IGMPv2 specification, there are 3 states and a flag. * * In Non-Member state, we simply don't have a membership record. * In Delaying Member state, our timer is running (inm->inm_timer). * In Idle Member state, our timer is not running (inm->inm_timer==0). * * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if we * have heard a report from another member, or IGMP_IREPORTEDLAST if * I sent the last report. */ switch (igmp->igmp_type) { case IGMP_MEMBERSHIP_QUERY: ++V_igmpstat.igps_rcv_queries; if (ifp->if_flags & IFF_LOOPBACK) break; if (igmp->igmp_code == 0) { /* * Old router. Remember that the querier on this * interface is old, and set the timer to the value * in RFC 1112. 
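* (An IGMPv1 query carries no max-response time, so the RFC 1112 default of IGMP_MAX_HOST_REPORT_DELAY is substituted below.)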
*/ mtx_lock(&igmp_mtx); rti = find_rti(ifp); if (rti == NULL) { mtx_unlock(&igmp_mtx); m_freem(m); return; } rti->rti_type = IGMP_V1_ROUTER; rti->rti_time = 0; mtx_unlock(&igmp_mtx); timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; if (ip->ip_dst.s_addr != igmp_all_hosts_group || igmp->igmp_group.s_addr != 0) { ++V_igmpstat.igps_rcv_badqueries; m_freem(m); return; } } else { /* * New router. Simply do the new validity check. */ if (igmp->igmp_group.s_addr != 0 && !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { ++V_igmpstat.igps_rcv_badqueries; m_freem(m); return; } } /* * - Start the timers in all of our membership records that * the query applies to for the interface on which the * query arrived excl. those that belong to the "all-hosts" * group (224.0.0.1). * - Restart any timer that is already running but has a * value longer than the requested timeout. * - Use the value specified in the query message as the * maximum timeout. */ IN_MULTI_LOCK(); IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_ifp == ifp && inm->inm_addr.s_addr != igmp_all_hosts_group && (igmp->igmp_group.s_addr == 0 || igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { if (inm->inm_timer == 0 || inm->inm_timer > timer) { inm->inm_timer = IGMP_RANDOM_DELAY(timer); igmp_timers_are_running = 1; } } IN_NEXT_MULTI(step, inm); } IN_MULTI_UNLOCK(); break; case IGMP_V1_MEMBERSHIP_REPORT: case IGMP_V2_MEMBERSHIP_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports can * potentially get looped back if we are a multicast router, * so discard reports sourced by me. */ IFP_TO_IA(ifp, ia); if (ia != NULL && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) break; ++V_igmpstat.igps_rcv_reports; if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { ++V_igmpstat.igps_rcv_badreports; m_freem(m); return; } /* * KLUDGE: if the IP source address of the report has an * unspecified (i.e., zero) subnet number, as is allowed for * a booting host, replace it with the correct subnet number * so that a process-level multicast routing daemon can * determine which subnet it arrived from. This is necessary * to compensate for the lack of any way for a process to * determine the arrival interface of an incoming packet. */ if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } /* * If we belong to the group being reported, stop our timer * for that group. */ IN_MULTI_LOCK(); IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); if (inm != NULL) { inm->inm_timer = 0; ++V_igmpstat.igps_rcv_ourreports; inm->inm_state = IGMP_OTHERMEMBER; } IN_MULTI_UNLOCK(); break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ rip_input(m, off); } void igmp_joingroup(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); if (inm->inm_addr.s_addr == igmp_all_hosts_group || inm->inm_ifp->if_flags & IFF_LOOPBACK) { inm->inm_timer = 0; inm->inm_state = IGMP_OTHERMEMBER; } else { mtx_lock(&igmp_mtx); inm->inm_rti = find_rti(inm->inm_ifp); mtx_unlock(&igmp_mtx); if (inm->inm_rti != NULL) { igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); inm->inm_state = IGMP_IREPORTEDLAST; igmp_timers_are_running = 1; } /* XXX handling of failure case? 
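* (find_rti() can fail under memory pressure, in which case the join completes without scheduling a report.)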
*/ } } void igmp_leavegroup(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); if (inm->inm_state == IGMP_IREPORTEDLAST && inm->inm_addr.s_addr != igmp_all_hosts_group && !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && inm->inm_rti->rti_type != IGMP_V1_ROUTER) igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); } void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); register struct in_multi *inm; struct in_multistep step; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. */ if (!igmp_timers_are_running) return; IN_MULTI_LOCK(); igmp_timers_are_running = 0; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_timer == 0) { /* do nothing */ } else if (--inm->inm_timer == 0) { igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); inm->inm_state = IGMP_IREPORTEDLAST; } else { igmp_timers_are_running = 1; } IN_NEXT_MULTI(step, inm); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IN_MULTI_UNLOCK(); } void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct router_info *rti; IGMP_PRINTF("[igmp.c,_slowtimo] -- > entering \n"); mtx_lock(&igmp_mtx); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_type == IGMP_V1_ROUTER) { rti->rti_time++; if (rti->rti_time >= IGMP_AGE_THRESHOLD) rti->rti_type = IGMP_V2_ROUTER; } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); mtx_unlock(&igmp_mtx); IGMP_PRINTF("[igmp.c,_slowtimo] -- > exiting \n"); } static void igmp_sendpkt(struct in_multi *inm, int type, unsigned long addr) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct mbuf *m; struct igmp *igmp; struct ip *ip; struct ip_moptions imo; IN_MULTI_LOCK_ASSERT(); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) return; m->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(inm->inm_ifp, m); #endif m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); m->m_data += sizeof(struct ip); m->m_len = IGMP_MINLEN; igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; imo.imo_multicast_ifp = inm->inm_ifp; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * XXX: Do we have to worry about reentrancy here? Don't think so. */ ip_output(m, router_alert, &igmprt, 0, &imo, NULL); ++V_igmpstat.igps_snd_reports; } Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 185087) +++ head/sys/netinet/in.c (revision 185088) @@ -1,1015 +1,1017 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_mask2len(struct in_addr *); static void in_len2mask(struct in_addr *, int); static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); static int in_addprefix(struct in_ifaddr *, int); static int in_scrubprefix(struct in_ifaddr *); static void in_socktrim(struct sockaddr_in *); static int in_ifinit(struct ifnet *, struct in_ifaddr *, struct sockaddr_in *, int); static void in_purgemaddrs(struct ifnet *); -static int subnetsarelocal = 0; +#ifdef VIMAGE_GLOBALS +static int subnetsarelocal; +static int sameprefixcarponly; +extern struct inpcbinfo ripcbinfo; +extern struct inpcbinfo udbinfo; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, subnetsarelocal, 0, "Treat all subnets as directly connected"); -static int sameprefixcarponly = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, sameprefixcarponly, 0, "Refuse to create same prefixes on different interfaces"); - -extern struct inpcbinfo ripcbinfo; -extern struct inpcbinfo udbinfo; /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). If subnetsarelocal * is true, this includes other subnets of the local net. * Otherwise, it includes only the directly-connected (sub)nets. */ int in_localaddr(struct in_addr in) { INIT_VNET_INET(curvnet); register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; if (V_subnetsarelocal) { TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if ((i & ia->ia_netmask) == ia->ia_net) return (1); } else { TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if ((i & ia->ia_subnetmask) == ia->ia_subnet) return (1); } return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. 
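* (The lookup walks only the INADDR_HASH chain for the given address rather than the whole address list.)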
*/ int in_localip(struct in_addr in) { INIT_VNET_INET(curvnet); struct in_ifaddr *ia; LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) return (1); } return (0); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { register u_long i = ntohl(in.s_addr); register u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { register char *cplim = (char *) &ap->sin_addr; register char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } static int in_mask2len(mask) struct in_addr *mask; { int x, y; u_char *p; p = (u_char *)mask; for (x = 0; x < sizeof(*mask); x++) { if (p[x] != 0xff) break; } y = 0; if (x < sizeof(*mask)) { for (y = 0; y < 8; y++) { if ((p[x] & (0x80 >> y)) == 0) break; } } return (x * 8 + y); } static void in_len2mask(struct in_addr *mask, int len) { int i; u_char *p; p = (u_char *)mask; bzero(mask, sizeof(*mask)); for (i = 0; i < len / 8; i++) p[i] = 0xff; if (len % 8) p[i] = (0xff00 >> (len % 8)) & 0xff; } /* * Generic internet control operations (ioctl's). * Ifp is 0 if not an interface-specific ioctl. */ /* ARGSUSED */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { INIT_VNET_INET(curvnet); /* both so and ifp can be NULL here! */ register struct ifreq *ifr = (struct ifreq *)data; register struct in_ifaddr *ia, *iap; register struct ifaddr *ifa; struct in_addr allhosts_addr; struct in_addr dst; struct in_ifaddr *oia; struct in_aliasreq *ifra = (struct in_aliasreq *)data; struct sockaddr_in oldaddr; int error, hostIsNew, iaIsNew, maskIsNew, s; int iaIsFirst; ia = NULL; iaIsFirst = 0; iaIsNew = 0; allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); switch (cmd) { case SIOCALIFADDR: if (td != NULL) { error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); } if (ifp == NULL) return (EINVAL); return in_lifaddr_ioctl(so, cmd, data, ifp, td); case SIOCDLIFADDR: if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (ifp == NULL) return (EINVAL); return in_lifaddr_ioctl(so, cmd, data, ifp, td); case SIOCGLIFADDR: if (ifp == NULL) return (EINVAL); return in_lifaddr_ioctl(so, cmd, data, ifp, td); } /* * Find address for this interface, if it exists. * * If an alias address was specified, find that one instead of * the first one on the interface, if possible. 
*/ if (ifp != NULL) { dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr; LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) if (iap->ia_ifp == ifp && iap->ia_addr.sin_addr.s_addr == dst.s_addr) { ia = iap; break; } if (ia == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { iap = ifatoia(ifa); if (iap->ia_addr.sin_family == AF_INET) { ia = iap; break; } } if (ia == NULL) iaIsFirst = 1; } switch (cmd) { case SIOCAIFADDR: case SIOCDIFADDR: if (ifp == NULL) return (EADDRNOTAVAIL); if (ifra->ifra_addr.sin_family == AF_INET) { for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) { if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == ifra->ifra_addr.sin_addr.s_addr) break; } if ((ifp->if_flags & IFF_POINTOPOINT) && (cmd == SIOCAIFADDR) && (ifra->ifra_dstaddr.sin_addr.s_addr == INADDR_ANY)) { return (EDESTADDRREQ); } } if (cmd == SIOCDIFADDR && ia == NULL) return (EADDRNOTAVAIL); /* FALLTHROUGH */ case SIOCSIFADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) return (error); } if (ifp == NULL) return (EADDRNOTAVAIL); if (ia == NULL) { ia = (struct in_ifaddr *) malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO); if (ia == NULL) return (ENOBUFS); /* * Protect from ipintr() traversing address list * while we're modifying it. */ s = splnet(); ifa = &ia->ia_ifa; IFA_LOCK_INIT(ifa); ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ifa->ifa_refcnt = 1; TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); ia->ia_sockmask.sin_len = 8; ia->ia_sockmask.sin_family = AF_INET; if (ifp->if_flags & IFF_BROADCAST) { ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); ia->ia_broadaddr.sin_family = AF_INET; } ia->ia_ifp = ifp; TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); splx(s); iaIsNew = 1; } break; case SIOCSIFBRDADDR: if (td != NULL) { error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); } /* FALLTHROUGH */ case SIOCGIFADDR: case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFBRDADDR: if (ia == NULL) return (EADDRNOTAVAIL); break; } switch (cmd) { case SIOCGIFADDR: *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; return (0); case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) return (EINVAL); *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; return (0); case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) return (EINVAL); *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; return (0); case SIOCGIFNETMASK: *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; return (0); case SIOCSIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) return (EINVAL); oldaddr = ia->ia_dstaddr; ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; if (ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR, (caddr_t)ia); IFF_UNLOCKGIANT(ifp); if (error) { ia->ia_dstaddr = oldaddr; return (error); } } if (ia->ia_flags & IFA_ROUTE) { ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); } return (0); case SIOCSIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) return (EINVAL); ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; return (0); case SIOCSIFADDR: error = in_ifinit(ifp, ia, (struct sockaddr_in *) 
&ifr->ifr_addr, 1); if (error != 0 && iaIsNew) break; if (error == 0) { if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST) != 0) in_addmulti(&allhosts_addr, ifp); EVENTHANDLER_INVOKE(ifaddr_event, ifp); } return (0); case SIOCSIFNETMASK: ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); return (0); case SIOCAIFADDR: maskIsNew = 0; hostIsNew = 1; error = 0; if (ia->ia_addr.sin_family == AF_INET) { if (ifra->ifra_addr.sin_len == 0) { ifra->ifra_addr = ia->ia_addr; hostIsNew = 0; } else if (ifra->ifra_addr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) hostIsNew = 0; } if (ifra->ifra_mask.sin_len) { in_ifscrub(ifp, ia); ia->ia_sockmask = ifra->ifra_mask; ia->ia_sockmask.sin_family = AF_INET; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); maskIsNew = 1; } if ((ifp->if_flags & IFF_POINTOPOINT) && (ifra->ifra_dstaddr.sin_family == AF_INET)) { in_ifscrub(ifp, ia); ia->ia_dstaddr = ifra->ifra_dstaddr; maskIsNew = 1; /* We lie; but the effect's the same */ } if (ifra->ifra_addr.sin_family == AF_INET && (hostIsNew || maskIsNew)) error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); if (error != 0 && iaIsNew) break; if ((ifp->if_flags & IFF_BROADCAST) && (ifra->ifra_broadaddr.sin_family == AF_INET)) ia->ia_broadaddr = ifra->ifra_broadaddr; if (error == 0) { if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST) != 0) in_addmulti(&allhosts_addr, ifp); EVENTHANDLER_INVOKE(ifaddr_event, ifp); } return (error); case SIOCDIFADDR: /* * in_ifscrub kills the interface route. */ in_ifscrub(ifp, ia); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); EVENTHANDLER_INVOKE(ifaddr_event, ifp); error = 0; break; default: if (ifp == NULL || ifp->if_ioctl == NULL) return (EOPNOTSUPP); IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, cmd, data); IFF_UNLOCKGIANT(ifp); return (error); } /* * Protect from ipintr() traversing address list while we're modifying * it. */ s = splnet(); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); if (ia->ia_addr.sin_family == AF_INET) { LIST_REMOVE(ia, ia_hash); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * XXX: This is quite ugly because of locking and structure. */ oia = NULL; IFP_TO_IA(ifp, oia); if (oia == NULL) { struct in_multi *inm; IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); IN_LOOKUP_MULTI(allhosts_addr, ifp, inm); if (inm != NULL) in_delmulti_locked(inm); IN_MULTI_UNLOCK(); IFF_UNLOCKGIANT(ifp); } } IFAFREE(&ia->ia_ifa); splx(s); return (error); } /* * SIOC[GAD]LIFADDR. * SIOCGLIFADDR: get first address. (?!?) * SIOCGLIFADDR with IFLR_PREFIX: * get first address that matches the specified prefix. * SIOCALIFADDR: add the specified address. * SIOCALIFADDR with IFLR_PREFIX: * EINVAL since we can't deduce hostid part of the address. * SIOCDLIFADDR: delete the specified address. * SIOCDLIFADDR with IFLR_PREFIX: * delete the first address that matches the specified prefix. 
* return values: * EINVAL on invalid parameters * EADDRNOTAVAIL on prefix match failed/specified address not found * other values may be returned from in_ioctl() */ static int in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct if_laddrreq *iflr = (struct if_laddrreq *)data; struct ifaddr *ifa; /* sanity checks */ if (data == NULL || ifp == NULL) { panic("invalid argument to in_lifaddr_ioctl"); /*NOTREACHED*/ } switch (cmd) { case SIOCGLIFADDR: /* address must be specified on GET with IFLR_PREFIX */ if ((iflr->flags & IFLR_PREFIX) == 0) break; /*FALLTHROUGH*/ case SIOCALIFADDR: case SIOCDLIFADDR: /* address must be specified on ADD and DELETE */ if (iflr->addr.ss_family != AF_INET) return (EINVAL); if (iflr->addr.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); /* XXX need improvement */ if (iflr->dstaddr.ss_family && iflr->dstaddr.ss_family != AF_INET) return (EINVAL); if (iflr->dstaddr.ss_family && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: /*shouldn't happen*/ return (EOPNOTSUPP); } if (sizeof(struct in_addr) * 8 < iflr->prefixlen) return (EINVAL); switch (cmd) { case SIOCALIFADDR: { struct in_aliasreq ifra; if (iflr->flags & IFLR_PREFIX) return (EINVAL); /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR). */ bzero(&ifra, sizeof(ifra)); bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len); if (iflr->dstaddr.ss_family) { /*XXX*/ bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr, iflr->dstaddr.ss_len); } ifra.ifra_mask.sin_family = AF_INET; ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in); in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen); return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td)); } case SIOCGLIFADDR: case SIOCDLIFADDR: { struct in_ifaddr *ia; struct in_addr mask, candidate, match; struct sockaddr_in *sin; bzero(&mask, sizeof(mask)); bzero(&match, sizeof(match)); if (iflr->flags & IFLR_PREFIX) { /* lookup a prefix rather than address.
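* For prefix operations, the mask built from iflr->prefixlen is applied to both the request and each candidate address before they are compared.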
*/ in_len2mask(&mask, iflr->prefixlen); sin = (struct sockaddr_in *)&iflr->addr; match.s_addr = sin->sin_addr.s_addr; match.s_addr &= mask.s_addr; /* if you set extra bits, that's wrong */ if (match.s_addr != sin->sin_addr.s_addr) return (EINVAL); } else { /* on getting an address, take the 1st match */ /* on deleting an address, do exact match */ if (cmd != SIOCGLIFADDR) { in_len2mask(&mask, 32); sin = (struct sockaddr_in *)&iflr->addr; match.s_addr = sin->sin_addr.s_addr; } } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (match.s_addr == 0) break; candidate.s_addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr; candidate.s_addr &= mask.s_addr; if (candidate.s_addr == match.s_addr) break; } if (ifa == NULL) return (EADDRNOTAVAIL); ia = (struct in_ifaddr *)ifa; if (cmd == SIOCGLIFADDR) { /* fill in the if_laddrreq structure */ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { bcopy(&ia->ia_dstaddr, &iflr->dstaddr, ia->ia_dstaddr.sin_len); } else bzero(&iflr->dstaddr, sizeof(iflr->dstaddr)); iflr->prefixlen = in_mask2len(&ia->ia_sockmask.sin_addr); iflr->flags = 0; /*XXX*/ return (0); } else { struct in_aliasreq ifra; /* fill in_aliasreq and do ioctl(SIOCDIFADDR) */ bzero(&ifra, sizeof(ifra)); bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); bcopy(&ia->ia_addr, &ifra.ifra_addr, ia->ia_addr.sin_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr, ia->ia_dstaddr.sin_len); } bcopy(&ia->ia_sockmask, &ifra.ifra_mask, ia->ia_sockmask.sin_len); return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra, ifp, td)); } } } return (EOPNOTSUPP); /*just for safety*/ } /* * Delete any existing route for an interface. */ void in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia) { in_scrubprefix(ia); } /* * Initialize an interface's internet address * and routing table entry. */ static int in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, int scrub) { INIT_VNET_INET(ifp->if_vnet); register u_long i = ntohl(sin->sin_addr.s_addr); struct sockaddr_in oldaddr; int s = splimp(), flags = RTF_UP, error = 0; oldaddr = ia->ia_addr; if (oldaddr.sin_family == AF_INET) LIST_REMOVE(ia, ia_hash); ia->ia_addr = *sin; if (ia->ia_addr.sin_family == AF_INET) LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { IFF_LOCKGIANT(ifp); error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); IFF_UNLOCKGIANT(ifp); if (error) { splx(s); /* LIST_REMOVE(ia, ia_hash) is done in in_control */ ia->ia_addr = oldaddr; if (ia->ia_addr.sin_family == AF_INET) LIST_INSERT_HEAD(INADDR_HASH( ia->ia_addr.sin_addr.s_addr), ia, ia_hash); else /* * If oldaddr family is not AF_INET (e.g. * interface has just been created) in_control * does not call LIST_REMOVE, and we end up * with bogus ia entries in hash */ LIST_REMOVE(ia, ia_hash); return (error); } } splx(s); if (scrub) { ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; in_ifscrub(ifp, ia); ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; } if (IN_CLASSA(i)) ia->ia_netmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_netmask = IN_CLASSB_NET; else ia->ia_netmask = IN_CLASSC_NET; /* * The subnet mask usually includes at least the standard network part, * but may be smaller in the case of supernetting.
* If it is set, we believe it. */ if (ia->ia_subnetmask == 0) { ia->ia_subnetmask = ia->ia_netmask; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } else ia->ia_netmask &= ia->ia_subnetmask; ia->ia_net = i & ia->ia_netmask; ia->ia_subnet = i & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); #ifdef DEV_CARP /* * XXX: carp(4) does not have interface route */ if (ifp->if_type == IFT_CARP) return (0); #endif /* * Add route for the network. */ ia->ia_ifa.ifa_metric = ifp->if_metric; if (ifp->if_flags & IFF_BROADCAST) { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_netbroadcast.s_addr = htonl(ia->ia_net | ~ ia->ia_netmask); } else if (ifp->if_flags & IFF_LOOPBACK) { ia->ia_dstaddr = ia->ia_addr; flags |= RTF_HOST; } else if (ifp->if_flags & IFF_POINTOPOINT) { if (ia->ia_dstaddr.sin_family != AF_INET) return (0); flags |= RTF_HOST; } if ((error = in_addprefix(ia, flags)) != 0) return (error); return (error); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ static int in_addprefix(struct in_ifaddr *target, int flags) { INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_addr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } /* * If we got a matching prefix route inserted by another * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { if (V_sameprefixcarponly && target->ia_ifp->if_type != IFT_CARP && ia->ia_ifp->if_type != IFT_CARP) return (EEXIST); else return (0); } } /* * No one seems to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ static int in_scrubprefix(struct in_ifaddr *target) { INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct in_addr prefix, mask, p; int error; if ((target->ia_flags & IFA_ROUTE) == 0) return (0); if (rtinitflags(target)) prefix = target->ia_dstaddr.sin_addr; else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) p = ia->ia_dstaddr.sin_addr; else { p = ia->ia_addr.sin_addr; p.s_addr &= ia->ia_sockmask.sin_addr.s_addr; } if (prefix.s_addr != p.s_addr) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up.
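* (the rtinit() calls below emit the RTM_DELETE/RTM_ADD routing-socket messages such daemons listen for).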
* * XXX: a special case for carp(4) interface */ if ((ia->ia_flags & IFA_ROUTE) == 0 #ifdef DEV_CARP && (ia->ia_ifp->if_type != IFT_CARP) #endif ) { rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); target->ia_flags &= ~IFA_ROUTE; error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; return (error); } } /* * As no one seems to have this prefix, we can remove the route. */ rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); target->ia_flags &= ~IFA_ROUTE; return (0); } #undef rtinitflags /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; u_long t; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match * with a broadcast address. */ #define ia ((struct in_ifaddr *)ifa) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || in.s_addr == ia->ia_netbroadcast.s_addr || /* * Check for old-style (host 0) broadcast. */ t == ia->ia_subnet || t == ia->ia_net) && /* * Check for an all-ones subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff) return (1); return (0); #undef ia } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. */ static void in_purgemaddrs(struct ifnet *ifp) { INIT_VNET_INET(ifp->if_vnet); struct in_multi *inm; struct in_multi *oinm; #ifdef DIAGNOSTIC printf("%s: purging ifp %p\n", __func__, ifp); #endif IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); LIST_FOREACH_SAFE(inm, &V_in_multihead, inm_link, oinm) { if (inm->inm_ifp == ifp) in_delmulti_locked(inm); } IN_MULTI_UNLOCK(); IFF_UNLOCKGIANT(ifp); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { INIT_VNET_INET(ifp->if_vnet); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_purgemaddrs(ifp); } Index: head/sys/netinet/in_gif.c =================================================================== --- head/sys/netinet/in_gif.c (revision 185087) +++ head/sys/netinet/in_gif.c (revision 185088) @@ -1,435 +1,437 @@ /* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef MROUTING #include #endif /* MROUTING */ #include static int gif_validate4(const struct ip *, struct gif_softc *, struct ifnet *); extern struct domain inetdomain; struct protosw in_gif_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = 0/* IPPROTO_IPV[46] */, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in_gif_input, .pr_output = (pr_output_t*)rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; -static int ip_gif_ttl = GIF_TTL; +#ifdef VIMAGE_GLOBALS +extern int ip_gif_ttl; +#endif SYSCTL_V_INT(V_NET, vnet_gif, _net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, ip_gif_ttl, 0, ""); int in_gif_output(struct ifnet *ifp, int family, struct mbuf *m) { INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; struct ip iphdr; /* capsule IP header, host byte ordered */ struct etherip_header eiphdr; int proto, error; u_int8_t tos; GIF_LOCK_ASSERT(sc); if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || sin_dst->sin_family != AF_INET) { m_freem(m); return EAFNOSUPPORT; } switch (family) { #ifdef INET case AF_INET: { struct ip *ip; proto = IPPROTO_IPV4; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return ENOBUFS; } ip = mtod(m, struct ip *); tos = ip->ip_tos; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *ip6; proto = IPPROTO_IPV6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return ENOBUFS; } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; break; } #endif /* INET6 */ case AF_LINK: proto = IPPROTO_ETHERIP; eiphdr.eip_ver = ETHERIP_VERSION & ETHERIP_VER_VERS_MASK; eiphdr.eip_pad = 0; /* prepend Ethernet-in-IP header */ M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT); if (m && m->m_len < sizeof(struct etherip_header)) m = m_pullup(m, sizeof(struct etherip_header)); if (m == NULL) return ENOBUFS; bcopy(&eiphdr, mtod(m, struct etherip_header *), sizeof(struct etherip_header)); break; default: #ifdef DEBUG printf("in_gif_output: warning: unknown family %d passed\n", family); #endif m_freem(m); return EAFNOSUPPORT; } bzero(&iphdr, sizeof(iphdr)); iphdr.ip_src = sin_src->sin_addr; /* bidirectional configured tunnel mode */ if (sin_dst->sin_addr.s_addr != INADDR_ANY) iphdr.ip_dst = sin_dst->sin_addr; else { m_freem(m); return ENETUNREACH; } iphdr.ip_p = proto; /* version will be set in ip_output() */ iphdr.ip_ttl = V_ip_gif_ttl; iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); 
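/* Derive the outer TOS (and its ECN bits) from the inner packet; IFF_LINK1 selects full ECN propagation (ECN_ALLOWED) instead of ECN_NOCARE. */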
ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, &iphdr.ip_tos, &tos); /* prepend new IP header */ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m && m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { printf("ENOBUFS in in_gif_output %d\n", __LINE__); return ENOBUFS; } bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); M_SETFIB(m, sc->gif_fibnum); if (dst->sin_family != sin_dst->sin_family || dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { /* cache route doesn't match */ bzero(dst, sizeof(*dst)); dst->sin_family = sin_dst->sin_family; dst->sin_len = sizeof(struct sockaddr_in); dst->sin_addr = sin_dst->sin_addr; if (sc->gif_ro.ro_rt) { RTFREE(sc->gif_ro.ro_rt); sc->gif_ro.ro_rt = NULL; } #if 0 GIF2IFP(sc)->if_mtu = GIF_MTU; #endif } if (sc->gif_ro.ro_rt == NULL) { in_rtalloc_ign(&sc->gif_ro, 0, sc->gif_fibnum); if (sc->gif_ro.ro_rt == NULL) { m_freem(m); return ENETUNREACH; } /* if it constitutes infinite encapsulation, punt. */ if (sc->gif_ro.ro_rt->rt_ifp == ifp) { m_freem(m); return ENETUNREACH; /* XXX */ } #if 0 ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu - sizeof(struct ip); #endif } error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) && sc->gif_ro.ro_rt != NULL) { RTFREE(sc->gif_ro.ro_rt); sc->gif_ro.ro_rt = NULL; } return (error); } void in_gif_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); struct ifnet *gifp = NULL; struct gif_softc *sc; struct ip *ip; int af; u_int8_t otos; int proto; ip = mtod(m, struct ip *); proto = ip->ip_p; sc = (struct gif_softc *)encap_getarg(m); if (sc == NULL) { m_freem(m); V_ipstat.ips_nogif++; return; } gifp = GIF2IFP(sc); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); V_ipstat.ips_nogif++; return; } otos = ip->ip_tos; m_adj(m, off); switch (proto) { #ifdef INET case IPPROTO_IPV4: { struct ip *ip; af = AF_INET; if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) return; } ip = mtod(m, struct ip *); if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, &otos, &ip->ip_tos) == 0) { m_freem(m); return; } break; } #endif #ifdef INET6 case IPPROTO_IPV6: { struct ip6_hdr *ip6; u_int8_t itos, oitos; af = AF_INET6; if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return; } ip6 = mtod(m, struct ip6_hdr *); itos = oitos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, &otos, &itos) == 0) { m_freem(m); return; } if (itos != oitos) { ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); } break; } #endif /* INET6 */ case IPPROTO_ETHERIP: af = AF_LINK; break; default: V_ipstat.ips_nogif++; m_freem(m); return; } gif_input(m, af, gifp); return; } /* * validate outer address. */ static int gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) { INIT_VNET_INET(curvnet); struct sockaddr_in *src, *dst; struct in_ifaddr *ia4; src = (struct sockaddr_in *)sc->gif_psrc; dst = (struct sockaddr_in *)sc->gif_pdst; /* check for address match */ if (src->sin_addr.s_addr != ip->ip_dst.s_addr || dst->sin_addr.s_addr != ip->ip_src.s_addr) return 0; /* martian filters on outer source - NOT done in ip_input! 
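* (Illustrative note: the checks that follow reject outer sources that are multicast, fall in nets 0, 127 or 255, or match a local broadcast address, since none of these can be a valid tunnel endpoint.)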
*/ if (IN_MULTICAST(ntohl(ip->ip_src.s_addr))) return 0; switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return 0; } /* reject packets with broadcast on source */ TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) return 0; } /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = ip->ip_src; /* XXX MRT check for the interface we would use on output */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, sc->gif_fibnum); if (!rt || rt->rt_ifp != ifp) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " "due to ingress filter\n", if_name(GIF2IFP(sc)), (u_int32_t)ntohl(sin.sin_addr.s_addr)); #endif if (rt) RTFREE_LOCKED(rt); return 0; } RTFREE_LOCKED(rt); } return 32 * 2; } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int gif_encapcheck4(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct gif_softc *sc; struct ifnet *ifp; /* sanity check done in caller */ sc = (struct gif_softc *)arg; /* LINTED const cast */ m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL; return gif_validate4(&ip, sc, ifp); } int in_gif_attach(struct gif_softc *sc) { sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck, &in_gif_protosw, sc); if (sc->encap_cookie4 == NULL) return EEXIST; return 0; } int in_gif_detach(struct gif_softc *sc) { int error; error = encap_detach(sc->encap_cookie4); if (error == 0) sc->encap_cookie4 = NULL; return error; } Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 185087) +++ head/sys/netinet/in_mcast.c (revision 185088) @@ -1,1831 +1,1833 @@ /*- * Copyright (c) 2007 Bruce M. Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. * Until further notice, this file requires INET to compile. * TODO: Make this infrastructure independent of address family. * TODO: Teach netinet6 to use this code. * TODO: Hook up SSM logic to IGMPv3/MLDv2. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; #ifdef INET6 struct sockaddr_in6 sin6; #endif }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "in_msource", "IPv4 multicast source filter"); /* * The IPv4 multicast list (in_multihead and associated structures) are * protected by the global in_multi_mtx. See in_var.h for more details. For * now, in_multi_mtx is marked as recursible due to IGMP's calling back into * ip_output() to send IGMP packets while holding the lock; this probably is * not quite desirable. */ +#ifdef VIMAGE_GLOBALS struct in_multihead in_multihead; /* XXX BSS initialization */ +#endif struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE); /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_match_group() * imo_match_source() * in_addmulti() * in_delmulti() * in_delmulti_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() */ static int imo_grow(struct ip_moptions *); static int imo_join_source(struct ip_moptions *, size_t, sockunion_t *); static int imo_leave_source(struct ip_moptions *, size_t, sockunion_t *); static int inp_change_source_filter(struct inpcb *, struct sockopt *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. 
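* (Illustrative note: growth follows newmax = 2 * (oldmax + 1) - 1, so a vector of 15 grows to 31, then 63, and so on, until IP_MAX_MEMBERSHIPS caps it.)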
*/ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_IPMSOURCE, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { nmfilters[idx].imf_fmode = MCAST_EXCLUDE; nmfilters[idx].imf_nsources = 0; TAILQ_INIT(&nmfilters[idx].imf_sources); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_IPMSOURCE); return (ETOOMANYREFS); } return (0); } /* * Add a source to a multicast filter list. * Assumes the associated inpcb is locked. */ static int imo_join_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src) { struct in_msource *ims, *nims; struct in_mfilter *imf; KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters vector not allocated", __func__)); imf = &imo->imo_mfilters[gidx]; if (imf->imf_nsources == IP_MAX_SOURCE_FILTER) return (ENOBUFS); ims = imo_match_source(imo, gidx, &src->sa); if (ims != NULL) return (EADDRNOTAVAIL); /* Do not sleep with inp lock held. */ nims = malloc(sizeof(struct in_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOBUFS); nims->ims_addr = src->ss; TAILQ_INSERT_TAIL(&imf->imf_sources, nims, ims_next); imf->imf_nsources++; return (0); } static int imo_leave_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src) { struct in_msource *ims; struct in_mfilter *imf; KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters vector not allocated", __func__)); imf = &imo->imo_mfilters[gidx]; if (imf->imf_nsources == IP_MAX_SOURCE_FILTER) return (ENOBUFS); ims = imo_match_source(imo, gidx, &src->sa); if (ims == NULL) return (EADDRNOTAVAIL); TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); free(ims, M_IPMSOURCE); imf->imf_nsources--; return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ size_t imo_match_group(struct ip_moptions *imo, struct ifnet *ifp, struct sockaddr *group) { sockunion_t *gsa; struct in_multi **pinm; int idx; int nmships; gsa = (sockunion_t *)group; /* The imo_membership array may be lazily allocated.
*/ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; #if 0 printf("%s: trying ifp = %p, inaddr = %s ", __func__, ifp, inet_ntoa(gsa->sin.sin_addr)); printf("against %p, %s\n", (*pinm)->inm_ifp, inet_ntoa((*pinm)->inm_addr)); #endif if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && (*pinm)->inm_addr.s_addr == gsa->sin.sin_addr.s_addr) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find a multicast source entry for this imo which matches * the given group index for this socket, and source address. */ struct in_msource * imo_match_source(struct ip_moptions *imo, size_t gidx, struct sockaddr *src) { struct in_mfilter *imf; struct in_msource *ims, *pims; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazily allocated. */ if (imo->imo_mfilters == NULL) return (NULL); pims = NULL; imf = &imo->imo_mfilters[gidx]; TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) { /* * Perform bitwise comparison of two IPv4 addresses. * TODO: Do the same for IPv6. * Do not use sa_equal() for this as it is not aware of * deeper structure in sockaddr_in or sockaddr_in6. */ if (((struct sockaddr_in *)&ims->ims_addr)->sin_addr.s_addr == ((struct sockaddr_in *)src)->sin_addr.s_addr) { pims = ims; break; } } return (pims); } /* * Join an IPv4 multicast group. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { INIT_VNET_INET(ifp->if_vnet); struct in_multi *inm; inm = NULL; IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); IN_LOOKUP_MULTI(*ap, ifp, inm); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); ++inm->inm_refcount; } else do { sockunion_t gsa; struct ifmultiaddr *ifma; struct in_multi *ninm; int error; memset(&gsa, 0, sizeof(gsa)); gsa.sin.sin_family = AF_INET; gsa.sin.sin_len = sizeof(struct sockaddr_in); gsa.sin.sin_addr = *ap; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. * If so, bump the refcount on the existing network-layer * group association and return it. */ error = if_addmulti(ifp, &gsa.sa, &ifma); if (error) break; if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || inm->inm_addr.s_addr != ap->s_addr) panic("%s: ifma is inconsistent", __func__); #endif ++inm->inm_refcount; break; } /* * A new membership is needed; construct it and * perform the IGMP join. */ ninm = malloc(sizeof(*ninm), M_IPMADDR, M_NOWAIT | M_ZERO); if (ninm == NULL) { if_delmulti_ifma(ifma); break; } ninm->inm_addr = *ap; ninm->inm_ifp = ifp; ninm->inm_ifma = ifma; ninm->inm_refcount = 1; ifma->ifma_protospec = ninm; LIST_INSERT_HEAD(&V_in_multihead, ninm, inm_link); igmp_joingroup(ninm); inm = ninm; } while (0); IN_MULTI_UNLOCK(); IFF_UNLOCKGIANT(ifp); return (inm); } /* * Leave an IPv4 multicast group. * It is OK to call this routine if the underlying ifnet went away. * * XXX: To deal with the ifp going away, we cheat; the link-layer code in net * will set ifma_ifp to NULL when the associated ifnet instance is detached * from the system.
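* (Illustrative note: in_multi records are reference-counted; only when the last reference is dropped does in_delmulti_locked() issue the IGMP leave and free the record.)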
* * The only reason we need to violate layers and check ifma_ifp here at all * is because certain hardware drivers still require Giant to be held, * and it must always be taken before other locks. */ void in_delmulti(struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm != NULL, ("%s: inm is NULL", __func__)); KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IFF_LOCKGIANT(ifp); } IN_MULTI_LOCK(); in_delmulti_locked(inm); IN_MULTI_UNLOCK(); if (ifp != NULL) IFF_UNLOCKGIANT(ifp); } /* * Delete a multicast address record, with locks held. * * It is OK to call this routine if the ifp went away. * Assumes that caller holds the IN_MULTI lock, and that * Giant was taken before other locks if required by the hardware. */ void in_delmulti_locked(struct in_multi *inm) { struct ifmultiaddr *ifma; IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_refcount >= 1, ("%s: freeing freed inm", __func__)); if (--inm->inm_refcount == 0) { igmp_leavegroup(inm); ifma = inm->inm_ifma; #ifdef DIAGNOSTIC if (bootverbose) printf("%s: purging ifma %p\n", __func__, ifma); #endif KASSERT(ifma->ifma_protospec == inm, ("%s: ifma_protospec != inm", __func__)); ifma->ifma_protospec = NULL; LIST_REMOVE(inm, inm_link); free(inm, M_IPMADDR); if_delmulti_ifma(ifma); } } /* * Block or unblock an ASM/SSM multicast source on an inpcb. */ static int inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; size_t idx; int error; int block; ifp = NULL; error = 0; block = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (mreqs.imr_interface.s_addr != INADDR_ANY) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) block = 1; #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imr_interface = %s, ifp = %p\n", __func__, inet_ntoa(mreqs.imr_interface), ifp); } #endif break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) block = 1; break; default: #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name); } #endif return (EOPNOTSUPP); break; } /* XXX INET6 */ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are 
actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; /* * SSM multicast truth table for block/unblock operations. * * Operation Filter Mode Entry exists? Action * * block exclude no add source to filter * unblock include no add source to filter * block include no EINVAL * unblock exclude no EINVAL * block exclude yes EADDRNOTAVAIL * unblock include yes EADDRNOTAVAIL * block include yes remove source from filter * unblock exclude yes remove source from filter * * FreeBSD does not explicitly distinguish between ASM and SSM * mode sockets; all sockets are assumed to have a filter list. */ #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imf_fmode is %s\n", __func__, imf->imf_fmode == MCAST_INCLUDE ? "include" : "exclude"); } #endif ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) || (block == 0 && imf->imf_fmode == MCAST_INCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: adding %s to filter list\n", __func__, inet_ntoa(ssa->sin.sin_addr)); } #endif error = imo_join_source(imo, idx, ssa); } if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) || (block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) { /* * If the socket is in inclusive mode: * the source is already blocked as it has no entry. * If the socket is in exclusive mode: * the source is already unblocked as it has no entry. */ #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: ims %p; %s already [un]blocked\n", __func__, ims, inet_ntoa(ssa->sin.sin_addr)); } #endif error = EINVAL; } } else { if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) || (block == 0 && imf->imf_fmode == MCAST_INCLUDE)) { /* * If the socket is in exclusive mode: * the source is already blocked as it has an entry. * If the socket is in inclusive mode: * the source is already unblocked as it has an entry. */ #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: ims %p; %s already [un]blocked\n", __func__, ims, inet_ntoa(ssa->sin.sin_addr)); } #endif error = EADDRNOTAVAIL; } if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) || (block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: removing %s from filter list\n", __func__, inet_ntoa(ssa->sin.sin_addr)); } #endif error = imo_leave_source(imo, idx, ssa); } } out_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. 
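* (Illustrative note: because allocation may sleep, the function drops the INP lock, allocates, re-locks and then re-checks inp_moptions; if another thread installed options in the meantime, the fresh allocations are freed and the existing pointer is returned.)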
*/ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = (struct ip_moptions *)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = (struct in_multi **)malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = (struct in_mfilter *)malloc( sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_IPMSOURCE, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) { imfp[idx].imf_fmode = MCAST_EXCLUDE; imfp[idx].imf_nsources = 0; TAILQ_INIT(&imfp[idx].imf_sources); } imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_IPMSOURCE); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Discard the IP multicast options (and source filters). */ void inp_freemoptions(struct ip_moptions *imo) { struct in_mfilter *imf; struct in_msource *ims, *tims; size_t idx, nmships; KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { in_delmulti(imo->imo_membership[idx]); if (imo->imo_mfilters != NULL) { imf = &imo->imo_mfilters[idx]; TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); free(ims, M_IPMSOURCE); imf->imf_nsources--; } KASSERT(imf->imf_nsources == 0, ("%s: did not free all imf_nsources", __func__)); } } if (imo->imo_mfilters != NULL) free(imo->imo_mfilters, M_IPMSOURCE); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct in_msource *ims; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; msfr.msfr_fmode = imf->imf_fmode; msfr.msfr_nsrcs = imf->imf_nsources; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * msfr.msfr_nsrcs is always set to the total number of filter * entries which the kernel currently has for this group. 
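* (Illustrative note: a caller can therefore size its buffer by issuing the request once with msfr_nsrcs set to 0, then repeating it with a buffer large enough for the count returned.)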
*/ tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { /* * Make a copy of the source vector so that we do not * thrash the inpcb lock whilst copying it out. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT); if (tss == NULL) { error = ENOBUFS; } else { ptss = tss; TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) { memcpy(ptss++, &ims->ims_addr, sizeof(struct sockaddr_storage)); } } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); } if (error) return (error); error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_INET(curvnet); struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW nor SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (imo->imo_multicast_addr.s_addr != INADDR_ANY) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; IFP_TO_IA(ifp, ia); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; } } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Join an IPv4 multicast group, possibly with a source.
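 * (Illustrative sketch, hypothetical userland usage of two of the options handled here; "s" is an AF_INET datagram socket:
 *
 *	struct ip_mreq mr = { 0 };
 *	mr.imr_multiaddr.s_addr = inet_addr("239.0.0.1");
 *	mr.imr_interface.s_addr = INADDR_ANY;
 *	setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr));
 *
 *	struct ip_mreq_source mrs = { 0 };
 *	mrs.imr_multiaddr.s_addr = inet_addr("232.1.1.1");
 *	mrs.imr_sourceaddr.s_addr = inet_addr("10.0.0.1");
 *	mrs.imr_interface.s_addr = INADDR_ANY;
 *	setsockopt(s, IPPROTO_IP, IP_ADD_SOURCE_MEMBERSHIP, &mrs, sizeof(mrs));)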
*/ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; ifp = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Obtain ifp. If no interface address was provided, * use the interface of the route in the unicast FIB for * the given multicast destination; usually, this is the * default route. * If this lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * If all of these conditions fail, return EADDRNOTAVAIL, and * reject the IPv4 multicast join. */ if (mreqs.imr_interface.s_addr != INADDR_ANY) { INADDR_TO_IFP(mreqs.imr_interface, ifp); } else { struct route ro; ro.ro_rt = NULL; *(struct sockaddr_in *)&ro.ro_dst = gsa->sin; in_rtalloc_ign(&ro, RTF_CLONING, inp->inp_inc.inc_fibnum); if (ro.ro_rt != NULL) { ifp = ro.ro_rt->rt_ifp; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); RTFREE(ro.ro_rt); } else { struct in_ifaddr *ia; struct ifnet *mfp = NULL; TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mfp = ia->ia_ifp; if (!(mfp->if_flags & IFF_LOOPBACK) && (mfp->if_flags & IFF_MULTICAST)) { ifp = mfp; break; } } } } #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imr_interface = %s, ifp = %p\n", __func__, inet_ntoa(mreqs.imr_interface), ifp); } #endif break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * XXX INET6 */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } /* * Obtain the ifp. 
*/ if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name); } #endif return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); /* * Check if we already hold membership of this group for this inpcb. * If so, we do not need to perform the initial join. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx != -1) { if (ssa->ss.ss_family != AF_UNSPEC) { /* * Attempting to join an ASM group (when already * an ASM or SSM member) is an error. */ error = EADDRNOTAVAIL; } else { imf = &imo->imo_mfilters[idx]; if (imf->imf_nsources == 0) { /* * Attempting to join an SSM group (when * already an ASM member) is an error. */ error = EINVAL; } else { /* * Attempting to join an SSM group (when * already an SSM member) means "add this * source to the inclusive filter list". */ error = imo_join_source(imo, idx, ssa); } } goto out_locked; } /* * Call imo_grow() to reallocate the membership and source filter * vectors if they are full. If the size would exceed the hard limit, * then we know we've really run out of entries. We keep the INP * lock held to avoid introducing a race condition. */ if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_locked; } /* * So far, so good: perform the layer 3 join, layer 2 join, * and make an IGMP announcement if needed. */ inm = in_addmulti(&gsa->sin.sin_addr, ifp); if (inm == NULL) { error = ENOBUFS; goto out_locked; } idx = imo->imo_num_memberships; imo->imo_membership[idx] = inm; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(TAILQ_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); /* * If this is a new SSM group join (i.e. a source was specified * with this group), add this source to the filter list. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* * An initial SSM join implies that this socket's membership * of the multicast group is now in inclusive mode. */ imf->imf_fmode = MCAST_INCLUDE; error = imo_join_source(imo, idx, ssa); if (error) { /* * Drop inp lock before calling in_delmulti(), * to prevent a lock order reversal. */ --imo->imo_num_memberships; INP_WUNLOCK(inp); in_delmulti(inm); return (error); } } out_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source.
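* (Illustrative sketch, hypothetical userland counterpart of the join example above: populate the same struct ip_mreq and call setsockopt(s, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mr, sizeof(mr)); dropping a single SSM source instead uses IP_DROP_SOURCE_MEMBERSHIP with a struct ip_mreq_source.)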
*/ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims, *tims; struct in_multi *inm; size_t idx; int error; ifp = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (gsa->sin.sin_addr.s_addr != INADDR_ANY) INADDR_TO_IFP(mreqs.imr_interface, ifp); #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: imr_interface = %s, ifp = %p\n", __func__, inet_ntoa(mreqs.imr_interface), ifp); } #endif break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: unknown sopt_name %d\n", __func__, sopt->sopt_name); } #endif return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_locked; } imf = &imo->imo_mfilters[idx]; /* * If we were instructed only to leave a given source, do so. */ if (ssa->ss.ss_family != AF_UNSPEC) { if (imf->imf_nsources == 0 || imf->imf_fmode == MCAST_EXCLUDE) { /* * Attempting to SSM leave an ASM group * is an error; should use *_BLOCK_SOURCE instead. * Attempting to SSM leave a source in a group when * the socket is in 'exclude mode' is also an error. */ error = EINVAL; } else { error = imo_leave_source(imo, idx, ssa); } /* * If an error occurred, or this source is not the last * source in the group, do not leave the whole group. */ if (error || imf->imf_nsources > 0) goto out_locked; } /* * Give up the multicast address record to which the membership points. 
*/ inm = imo->imo_membership[idx]; in_delmulti(inm); /* * Free any source filters for this group if they exist. * Revert inpcb to the default MCAST_EXCLUDE state. */ if (imo->imo_mfilters != NULL) { TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); free(ims, M_IPMSOURCE); imf->imf_nsources--; } KASSERT(imf->imf_nsources == 0, ("%s: imf_nsources not 0", __func__)); KASSERT(TAILQ_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); imf->imf_fmode = MCAST_EXCLUDE; } /* * Remove the gap in the membership array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_num_memberships--; out_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (addr.s_addr == INADDR_ANY) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: ifp = %p, addr = %s\n", __func__, ifp, inet_ntoa(addr)); /* XXX INET6 */ } #endif } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. 
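* (Illustrative note: unlike the incremental block/unblock options above, IP_MSFILTER replaces the whole filter state, mode plus source list, in one call; userland normally reaches it through the setsourcefilter(3) wrapper rather than building a struct __msfilterreq by hand.)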
*/ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims, *tims; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > IP_MAX_SOURCE_FILTER || (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_locked; } imf = &imo->imo_mfilters[idx]; #ifdef DIAGNOSTIC if (bootverbose) printf("%s: clearing source list\n", __func__); #endif /* * Remove any existing source filters. */ TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); free(ims, M_IPMSOURCE); imf->imf_nsources--; } KASSERT(imf->imf_nsources == 0, ("%s: source list not cleared", __func__)); /* * Apply any new source filters, if present. */ if (msfr.msfr_nsrcs > 0) { struct in_msource **pnims; struct in_msource *nims; struct sockaddr_storage *kss; struct sockaddr_storage *pkss; sockunion_t *psu; int i, j; /* * Drop the inp lock so we may sleep if we need to * in order to satisfy a malloc request. * We will re-take it before changing socket state. */ INP_WUNLOCK(inp); #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: loading %lu source list entries\n", __func__, (unsigned long)msfr.msfr_nsrcs); } #endif /* * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } /* * Perform argument checking on every sockaddr_storage * structure in the vector provided to us. Overwrite * fields which should not apply to source entries. * TODO: Check for duplicate sources on this pass. */ psu = (sockunion_t *)kss; for (i = 0; i < msfr.msfr_nsrcs; i++, psu++) { switch (psu->ss.ss_family) { case AF_INET: if (psu->sin.sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; } else { psu->sin.sin_port = 0; } break; #ifdef notyet case AF_INET6: if (psu->sin6.sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; } else { psu->sin6.sin6_port = 0; psu->sin6.sin6_flowinfo = 0; } break; #endif default: error = EAFNOSUPPORT; break; } if (error) break; } if (error) { free(kss, M_TEMP); return (error); } /* * Allocate a block to track all the in_msource * entries we are about to allocate, in case we * abruptly need to free them. */ pnims = malloc(sizeof(struct in_msource *) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK | M_ZERO); /* * Allocate up to nsrcs individual chunks.
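* (Illustrative note: with M_WAITOK, malloc(9) sleeps rather than returning NULL, so in practice the ENOBUFS unwind below is defensive.)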
* If we encounter an error, backtrack out of * all allocations cleanly; updates must be atomic. */ pkss = kss; nims = NULL; for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) { nims = malloc(sizeof(struct in_msource), M_IPMSOURCE, M_WAITOK | M_ZERO); pnims[i] = nims; } if (i < msfr.msfr_nsrcs) { for (j = 0; j < i; j++) { if (pnims[j] != NULL) free(pnims[j], M_IPMSOURCE); } free(pnims, M_TEMP); free(kss, M_TEMP); return (ENOBUFS); } INP_UNLOCK_ASSERT(inp); /* * Finally, apply the filters to the socket. * Re-take the inp lock; we are changing socket state. */ pkss = kss; INP_WLOCK(inp); for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) { memcpy(&(pnims[i]->ims_addr), pkss, sizeof(struct sockaddr_storage)); TAILQ_INSERT_TAIL(&imf->imf_sources, pnims[i], ims_next); imf->imf_nsources++; } free(pnims, M_TEMP); free(kss, M_TEMP); } /* * Update the filter mode on the socket before releasing the inpcb. */ INP_WLOCK_ASSERT(inp); imf->imf_fmode = msfr.msfr_fmode; out_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW nor SOCK_DGRAM, * or is a divert socket, reject it. * XXX Unlocked read of inp_socket believed OK. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int.
*/ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_change_source_filter(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 185087) +++ head/sys/netinet/in_pcb.c (revision 185088) @@ -1,1784 +1,1786 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include +#ifdef VIMAGE_GLOBALS /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. 
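* (Illustrative note: the initializers removed below record the former defaults, low range 600-1023, default range 10000-65535, high range 49152-65535; with VIMAGE_GLOBALS these values are now established during per-vnet initialization instead.)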
*/ -int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ -int ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ -int ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ -int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ -int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ +int ipport_lowfirstauto; +int ipport_lowlastauto; +int ipport_firstauto; +int ipport_lastauto; +int ipport_hifirstauto; +int ipport_hilastauto; /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ -int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_reservedlow = 0; +int ipport_reservedhigh; +int ipport_reservedlow; /* Variables dealing with random ephemeral port allocation. */ -int ipport_randomized = 1; /* user controlled via sysctl */ -int ipport_randomcps = 10; /* user controlled via sysctl */ -int ipport_randomtime = 45; /* user controlled via sysctl */ -int ipport_stoprandom = 0; /* toggled by ipport_tick */ +int ipport_randomized; +int ipport_randomcps; +int ipport_randomtime; +int ipport_stoprandom; int ipport_tcpallocs; int ipport_tcplastcount; +#endif #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port " "allocations before switching 
to a sequential one"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, ipport_randomtime, 0, "Minimum time to keep sequential port " "allocation before switching to a random one"); /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { #ifdef INET6 INIT_VNET_INET6(curvnet); #endif struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; SOCK_LOCK(so); mac_inpcb_create(so, inp); SOCK_UNLOCK(so); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; #if defined(IPSEC) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = inp->inp_lport == 0 && (nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0); error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { INIT_VNET_INET(inp->inp_vnet); struct socket *so = inp->inp_socket; unsigned short *lastport; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error, prison = 0; int dorandom; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken!
*/ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; if (nam) { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif if (sin->sin_addr.s_addr != INADDR_ANY) if (prison_ip(cred, 0, &sin->sin_addr.s_addr)) return (EINVAL); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (jailed(cred)) prison = 1; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, prison ? 0 : INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((t->inp_vflag & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_socket->so_options & SO_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); } if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr)) return (EADDRNOTAVAIL); t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, prison ? 0 : wild, cred); if (t && (t->inp_vflag & INP_TIMEWAIT)) { /* * XXXRW: If an inpcb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable.
*/ tw = intotw(inp); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & t->inp_socket->so_options) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket)) #endif return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { u_short first, last, aux; int count; if (laddr.s_addr != INADDR_ANY) if (prison_ip(cred, 0, &laddr.s_addr)) return (EINVAL); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP, use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count. */ if (pcbinfo != &V_udbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in_pcblookup_local(pcbinfo, laddr, lport, wild, cred)); } if (prison_ip(cred, 0, &laddr.s_addr)) return (EINVAL); *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If we don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash(inp); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well.
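 */

/*
 * Illustrative sketch, not part of the original file: the selection
 * order that in_pcbladdr() below implements, condensed into a single
 * decision ladder.  The example_addr_from_*() helpers are hypothetical
 * stand-ins for the corresponding blocks of the real function.
 */
#ifdef notdef
static int
example_select_srcaddr(struct route *sro, struct ucred *cred,
    struct in_addr *laddr)
{

	/* 1. No usable route: look for a directly connected interface. */
	if (sro->ro_rt == NULL || sro->ro_rt->rt_ifp == NULL)
		return (example_addr_from_ifa(cred, laddr));

	/* 2. Route via a non-loopback interface: take its address. */
	if ((sro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0)
		return (example_addr_from_rt(sro->ro_rt, cred, laddr));

	/* 3. Loopback route: find the destination's own interface. */
	return (example_addr_from_dst(sro, cred, laddr));
}
#endif

/*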
*/ static int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct in_ifaddr *ia; struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: null laddr", __func__)); error = 0; ia = NULL; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, RTF_CLONING, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !jailed(cred)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ laddr->s_addr = htonl(prison_getip(cred)); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { /* If not jailed, use the default returned. */ if (cred == NULL || !jailed(cred)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ laddr->s_addr = htonl(prison_getip(cred)); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. 
* In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain))); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain))); if (cred == NULL || !jailed(cred)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. As a last resort return the 'default' jail address. */ laddr->s_addr = htonl(prison_getip(cred)); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { INIT_VNET_INET(inp->inp_vnet); struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); INP_LOCK_ASSERT(inp); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; else if (faddr.s_addr == (u_long)INADDR_BROADCAST && (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); if (error) return (error); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, use the * address of that interface as our source address. 
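 *
 * (Illustrative note, not in the original: this is the case taken for
 * a sender that selected an outgoing multicast interface with, e.g.,
 *
 *	struct in_addr ifaddr;
 *	ifaddr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr,
 *	    sizeof(ifaddr));
 *
 * before connecting to a multicast destination; the address shown is
 * made up.)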
*/ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if (ia->ia_ifp == ifp) break; if (ia == NULL) return (EADDRNOTAVAIL); laddr = ia->ia_addr.sin_addr; } } } oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } /* * Historically, in_pcbdetach() included the functionality now found in * in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the * more complex life cycle of TCP. * * in_pcbdetach() is responsible for disconnecting the socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, for TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() may be deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("in_pcbdetach: inp_socket == NULL")); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbfree() is responsible for freeing an already-detached inpcb, as well * as removing it from any global inpcb lists it might be on. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *ipi = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("in_pcbfree: inp_socket != NULL")); INP_INFO_WLOCK_ASSERT(ipi); INP_WLOCK_ASSERT(inp); #ifdef IPSEC ipsec4_delete_pcbpolicy(inp); #endif /*IPSEC*/ inp->inp_gencnt = ++ipi->ipi_gencnt; in_pcbremlists(inp); if (inp->inp_options) (void)m_free(inp->inp_options); if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif INP_WUNLOCK(inp); uma_zfree(ipi->ipi_zone, inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash * lists, but can lead to confusing netstat output, as open sockets with * closed TCP connections will no longer appear to have their bound port * number. An explicit flag would be better, as it would allow us to leave * the port number intact after the connection is dropped. * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()?
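 */

/*
 * Illustrative sketch of the life cycle described above; this is a
 * hypothetical caller, not code from this file.  A TCP-like protocol
 * whose connection dies while the socket stays open first drops the
 * inpcb, and detaches/frees it only when the socket itself goes away.
 */
#ifdef notdef
static void
example_connection_reset(struct inpcb *inp)
{

	INP_INFO_WLOCK(inp->inp_pcbinfo);
	INP_WLOCK(inp);
	in_pcbdrop(inp);	/* release binding; so_pcb stays valid */
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(inp->inp_pcbinfo);
}

static void
example_socket_close(struct inpcb *inp)
{
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

	INP_INFO_WLOCK(ipi);
	INP_WLOCK(inp);
	in_pcbdetach(inp);	/* sever the socket <-> inpcb link */
	in_pcbfree(inp);	/* unhooks, unlocks and frees the inpcb */
	INP_INFO_WUNLOCK(ipi);
}
#endif

/*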
*/ void in_pcbdrop(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_vflag |= INP_DROPPED; if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } inp->inp_lport = 0; } } /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int wild_okay, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; INP_INFO_LOCK_ASSERT(pcbinfo); if (!wild_okay) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. 
*/ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found. */ return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) { break; } } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST /* * Lookup PCB in hash list. */ struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp; u_short fport = fport_arg, lport = lport_arg; INP_INFO_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) return (inp); } /* * Then look for a wildcard match, if requested. */ if (wildcard) { struct inpcb *local_wild = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_lport == lport) { if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; if (inp->inp_laddr.s_addr == laddr.s_addr) return (inp); else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 if (INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) local_wild_mapped = inp; else #endif local_wild = inp; } } } #ifdef INET6 if (local_wild == NULL) return (local_wild_mapped); #endif return (local_wild); } return (NULL); } /* * Insert PCB onto various hash lists. 
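 */

/*
 * Standalone illustration, not part of this file: how the
 * INP_PCBHASH() macro from in_pcb.h, used by the functions below,
 * folds a connection tuple into a hash bucket index.  All tuple
 * values here are made up.
 */
#ifdef notdef
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define EX_PCBHASH(faddr, lport, fport, mask) \
	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))

int
main(void)
{
	uint32_t faddr = inet_addr("192.0.2.1");	/* example peer */
	uint16_t lport = htons(8080);		/* example local port */
	uint16_t fport = htons(49152);		/* example foreign port */
	unsigned long mask = 511;	/* ipi_hashmask for 512 buckets */

	printf("bucket %lu\n",
	    (unsigned long)EX_PCBHASH(faddr, lport, fport, mask));
	return (0);
}
#endif

/*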
*/ int in_pcbinshash(struct inpcb *inp) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); return (0); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; else #endif /* INET6 */ hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); } /* * Remove PCB from various lists. */ void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_lport) { struct inpcbport *phd = inp->inp_phd; LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. 
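 */

/*
 * Standalone model of the throttling decision described above; the
 * variables mirror the sysctls, but this is an illustration, not
 * kernel code.
 */
#ifdef notdef
static int randomcps = 10;	/* allowed allocations per second */
static int randomtime = 45;	/* quiet seconds before re-enabling */
static int stoprandom, tcpallocs, tcplastcount;

/* Call once per second; returns nonzero if random allocation is on. */
static int
example_tick(void)
{

	if (tcpallocs <= tcplastcount + randomcps) {
		if (stoprandom > 0)
			stoprandom--;
	} else
		stoprandom = randomtime;
	tcplastcount = tcpallocs;
	return (stoprandom == 0);
}
#endif

/*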
*/ void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ INIT_VNET_INET(vnet_iter); if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { INIT_VNET_INET(curvnet); struct inpcb *inp; INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags == 1) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else { #endif /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); #ifdef INET6 } #endif db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? 
", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_FAITH) { db_printf("%sINP_FAITH", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? 
", " : ""); comma = 1; } } void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h (revision 185087) +++ head/sys/netinet/in_pcb.h (revision 185088) @@ -1,496 +1,498 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include #include #include #ifdef _KERNEL #include #endif #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ struct inpcbpolicy; /* * struct inpcb is the common protocol control block structure used in most * IP transport protocols. * * Pointers to local and foreign host table entries, local and foreign socket * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union { /* foreign host table entry */ struct in_addr_4in6 ie46_foreign; struct in6_addr ie6_foreign; } ie_dependfaddr; union { /* local host table entry */ struct in_addr_4in6 ie46_local; struct in6_addr ie6_local; } ie_dependladdr; #define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 #define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 #define ie6_faddr ie_dependfaddr.ie6_foreign #define ie6_laddr ie_dependladdr.ie6_local }; /* * XXX The defines for inc_* are hacks and should be changed to direct * references. */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */ /* protocol dependent part */ struct in_endpoints inc_ie; }; #define inc_isipv6 inc_flags /* temp compatibility */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr struct icmp6_filter; /*- * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 * and IPv6 sockets. In the case of TCP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A * few fields also require the global pcbinfo lock for the inpcb to be held, * when modified, such as the global connection lists and hashes, as well as * binding information (which affects which hash a connection is on). This * model means that connections can be looked up without holding the * per-connection lock, which is important for performance when attempting to * find the connection for a packet given its IP and port tuple. Writing to * these fields requires that write locks be held on both the inpcb and global locks. * * Key: * (c) - Constant after initialization * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * * A few other notes: * * When a read lock is held, stability of the field is guaranteed; to write * to a field, a write lock must generally be held.
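 *
 * An illustrative read-side pattern (hypothetical, not from this
 * header): a consumer that only inspects an existing connection can
 * stay on the read locks throughout, e.g.
 *
 *	INP_INFO_RLOCK(pcbinfo);
 *	inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 *	    0, NULL);
 *	if (inp != NULL)
 *		INP_RLOCK(inp);
 *	INP_INFO_RUNLOCK(pcbinfo);
 *
 * whereas changing the binding or the hash membership requires
 * INP_INFO_WLOCK() on the pcbinfo as well as INP_WLOCK() on the inpcb.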
* * netinet/netinet6-layer code should not assume that the inp_socket pointer * is safe to dereference without inp_lock being held, even for protocols * other than TCP (where the inpcb persists during TIMEWAIT even after the * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ int inp_flags; /* (i) generic IP/datagram flags */ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ #define INP_TIMEWAIT 0x8 /* .. probably doesn't go here */ #define INP_ONESBCAST 0x10 /* send all-ones broadcast */ #define INP_DROPPED 0x20 /* protocol drop flag */ #define INP_SOCKREF 0x40 /* strong socket reference */ u_char inp_ip_ttl; /* (i) time to live proto */ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_ispare1; /* (x) connection id / queue id */ void *inp_pspare[2]; /* (x) rtentry / general use */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ struct label *inp_label; /* (i) MAC label */ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ /* Protocol-dependent part; options. */ struct { u_char inp4_ip_tos; /* (i) type of service proto */ struct mbuf *inp4_options; /* (i) IP options */ struct ip_moptions *inp4_moptions; /* (i) IP multicast options */ } inp_depend4; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions struct { /* (i) IP options */ struct mbuf *inp6_options; /* (i) IP6 options for outgoing packets */ struct ip6_pktopts *inp6_outputopts; /* (i) IP multicast options */ struct ip6_moptions *inp6_moptions; /* (i) ICMPv6 code type filter */ struct icmp6_filter *inp6_icmp6filt; /* (i) IPV6_CHECKSUM setsockopt */ int inp6_cksum; short inp6_hops; } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ struct inpcbport *inp_phd; /* (i/p) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count of this instance */ struct rwlock inp_lock; #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_ip6_nxt inp_ip_p #define in6p_flowinfo inp_flow #define in6p_vflag inp_vflag #define in6p_options inp_depend6.inp6_options #define in6p_outputopts inp_depend6.inp6_outputopts #define in6p_moptions inp_depend6.inp6_moptions #define in6p_icmp6filt inp_depend6.inp6_icmp6filt #define in6p_cksum inp_depend6.inp6_cksum #define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ #define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME 
src sync over BSD*'s */ }; /* * The range of the generation count, as used in this implementation, is 9e19. * We would have to create 300 billion connections per second for this number * to roll over in a year. This seems sufficiently unlikely that we simply * don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use inpcbs. Hack * alert -- only define if struct xsocket is in scope. */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { size_t xi_len; /* length of this structure */ struct inpcb xi_inp; struct xsocket xi_socket; u_quad_t xi_alignment_hack; }; struct xinpgen { size_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count at this time */ }; #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; /* * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. */ struct inpcbinfo { /* * Global list of inpcbs on the protocol. */ struct inpcbhead *ipi_listhead; u_int ipi_count; /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ struct inpcbhead *ipi_hashbase; u_long ipi_hashmask; /* * Global hash of inpcbs, hashed by only local port number. */ struct inpcbporthead *ipi_porthashbase; u_long ipi_porthashmask; /* * Fields associated with port lookup and allocation. */ u_short ipi_lastport; u_short ipi_lastlow; u_short ipi_lasthi; /* * UMA zone from which inpcbs are allocated for this protocol. */ struct uma_zone *ipi_zone; /* * Generation count--incremented each time a connection is allocated * or freed. */ u_quad_t ipi_gencnt; struct rwlock ipi_lock; /* * vimage 1 * general use 1 */ void *ipi_pspare[2]; }; #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) #define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) #define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) #define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) #define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) #define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) #define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) #ifdef _KERNEL /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The * macros are reserved for use by the stack. 
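 */

/*
 * Illustrative sketch of how an outside consumer such as a TOE driver
 * might use these accessors; hypothetical code, not part of this
 * header.
 */
#ifdef notdef
static void
example_toe_peek(struct inpcb *inp)
{
	uint32_t laddr, faddr;
	uint16_t lport, fport;

	inp_rlock(inp);
	inp_4tuple_get(inp, &laddr, &lport, &faddr, &fport);
	inp_runlock(inp);
	/* Addresses and ports are returned in network byte order. */
}
#endif

/*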
*/ void inp_wlock(struct inpcb *); void inp_wunlock(struct inpcb *); void inp_rlock(struct inpcb *); void inp_runlock(struct inpcb *); #ifdef INVARIANTS void inp_lock_assert(struct inpcb *); void inp_unlock_assert(struct inpcb *); #else static __inline void inp_lock_assert(struct inpcb *inp __unused) { } static __inline void inp_unlock_assert(struct inpcb *inp __unused) { } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg); int inp_ip_tos_get(const struct inpcb *inp); void inp_ip_tos_set(struct inpcb *inp, int val); struct socket * inp_inpcbtosocket(struct inpcb *inp); struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) #define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) #define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) #define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) #define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) /* flags in inp_flags: */ #define INP_RECVOPTS 0x01 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x04 /* receive IP dst address */ #define INP_HDRINCL 0x08 /* user supplies entire IP header */ #define INP_HIGHPORT 0x10 /* user wants "high" port binding */ #define INP_LOWPORT 0x20 /* user wants "low" port binding */ #define INP_ANONPORT 0x40 /* port chosen for user */ #define INP_RECVIF 0x80 /* receive incoming interface */ #define INP_MTUDISC 0x100 /* user can do MTU discovery */ #define INP_FAITH 0x200 /* accept FAITH'ed connections */ #define INP_RECVTTL 0x400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x800 /* don't fragment packet */ #define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) #define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL) /* for KAME src sync 
over BSD*'s */ #define IN6P_HIGHPORT INP_HIGHPORT #define IN6P_LOWPORT INP_LOWPORT #define IN6P_ANONPORT INP_ANONPORT #define IN6P_RECVIF INP_RECVIF #define IN6P_MTUDISC INP_MTUDISC #define IN6P_FAITH INP_FAITH #define IN6P_CONTROLOPTS INP_CONTROLOPTS /* * socket AF version is {newer than,or include} * actual datagram AF version */ #define INPLOOKUP_WILDCARD 1 #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL extern int ipport_reservedhigh; extern int ipport_reservedlow; extern int ipport_lowfirstauto; extern int ipport_lowlastauto; extern int ipport_firstauto; extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; extern int ipport_randomized; +extern int ipport_randomcps; +extern int ipport_randomtime; extern int ipport_stoprandom; extern int ipport_tcpallocs; extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); void in_pcbdetach(struct inpcb *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbrehash(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); void in_pcbremlists(struct inpcb *inp); void ipport_tick(void *xtp); /* * Debugging routines compiled in when DDB is present. */ void db_print_inpcb(struct inpcb *inp, const char *name, int indent); #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: head/sys/netinet/in_proto.c =================================================================== --- head/sys/netinet/in_proto.c (revision 185087) +++ head/sys/netinet/in_proto.c (revision 185088) @@ -1,395 +1,396 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipx.h" #include "opt_mrouting.h" #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_pf.h" #include "opt_carp.h" #include "opt_sctp.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include /* * TCP/IP protocol family: IP, ICMP, UDP, TCP. */ static struct pr_usrreqs nousrreqs; #ifdef IPSEC #include #endif /* IPSEC */ #ifdef SCTP #include #include #include #include #endif /* SCTP */ #ifdef DEV_PFSYNC #include #include #endif #ifdef DEV_CARP #include #endif extern struct domain inetdomain; /* Spacer for loadable protocols. */ #define IPPROTOSPACER \ { \ .pr_domain = &inetdomain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inetsw[] = { { .pr_type = 0, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IP, .pr_init = ip_init, .pr_slowtimo = ip_slowtimo, .pr_drain = ip_drain, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = udp_init, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, .pr_input = tcp_input, .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs }, #ifdef SCTP { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_init = sctp_init, .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, 
.pr_input = rip_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ICMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp_input, .pr_ctloutput = rip_ctloutput, + .pr_init = icmp_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IGMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = igmp_input, .pr_ctloutput = rip_ctloutput, .pr_init = igmp_init, .pr_fasttimo = igmp_fasttimo, .pr_slowtimo = igmp_slowtimo, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RSVP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = rsvp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ah4_input, .pr_ctlinput = ah4_ctlinput, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = esp4_input, .pr_ctlinput = esp4_ctlinput, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipcomp4_input, .pr_usrreqs = &nousrreqs }, #endif /* IPSEC */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_MOBILE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, # ifdef INET6 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, #endif { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #ifdef DEV_PFSYNC { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = pfsync_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #endif /* DEV_PFSYNC */ #ifdef DEV_CARP { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp_input, .pr_output = (pr_output_t*)rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #endif /* DEV_CARP */ /* Spacer n-times for loadable protocols. 
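 *
 * (Illustrative note, not in the original: a loadable protocol module
 * claims one of these spacer slots at load time through
 * pf_proto_register(), along the lines of
 *
 *	static struct protosw mysw = {
 *		.pr_type = SOCK_RAW,
 *		.pr_domain = &inetdomain,
 *		.pr_protocol = IPPROTO_DIVERT,
 *		.pr_usrreqs = &nousrreqs,
 *	};
 *	error = pf_proto_register(PF_INET, &mysw);
 *
 * where the protocol number and fields shown are hypothetical.)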
*/ IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, .pr_usrreqs = &rip_usrreqs }, }; extern int in_inithead(void **, int); struct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn4_mpath_inithead, #else .dom_rtattach = in_inithead, #endif .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct sockaddr_in) }; DOMAIN_SET(inet); SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, "Internet Family"); SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); #ifdef SCTP SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW, 0, "SCTP"); #endif SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); #ifdef IPSEC /* XXX no protocol # to use, pick something "reserved" */ SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW, 0, "IPSEC"); SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW, 0, "AH"); SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW, 0, "ESP"); SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); #ifdef DEV_PFSYNC SYSCTL_NODE(_net_inet, IPPROTO_PFSYNC, pfsync, CTLFLAG_RW, 0, "PFSYNC"); #endif #ifdef DEV_CARP SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); #endif Index: head/sys/netinet/in_rmx.c =================================================================== --- head/sys/netinet/in_rmx.c (revision 185087) +++ head/sys/netinet/in_rmx.c (revision 185088) @@ -1,505 +1,515 @@ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * This code does two things necessary for the enhanced TCP metrics to * function in a useful manner: * 1) It marks all non-host routes as `cloning', thus ensuring that * every actual reference to such a route actually gets turned * into a reference to a host route to the specific destination * requested. * 2) When such routes lose all their references, it arranges for them * to be deleted in some random collection of circumstances, so that * a large quantity of stale routing data is not kept in kernel memory * indefinitely. See in_rtqtimo() below for the exact mechanism. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in_inithead(void **head, int off); #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* * Do what we need to do when inserting a route. */ static struct radix_node * in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); struct radix_node *ret; /* * A little bit of help for both IP output and input: * For host routes, we make sure that RTF_BROADCAST * is set for anything that looks like a broadcast address. * This way, we can avoid an expensive call to in_broadcast() * in ip_output() most of the time (because the route passed * to ip_output() is almost always a host route). * * We also do the same for local addresses, with the thought * that this might one day be used to speed up ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). */ if (rt->rt_flags & RTF_HOST) { if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { rt->rt_flags |= RTF_BROADCAST; } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) { rt->rt_flags |= RTF_LOCAL; } } if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; ret = rn_addroute(v_arg, n_arg, head, treenodes); if (ret == NULL && rt->rt_flags & RTF_HOST) { struct rtentry *rt2; /* * We are trying to add a host route, but can't. * Find out if it is because of an * ARP entry and delete it if so. */ rt2 = in_rtalloc1((struct sockaddr *)sin, 0, RTF_CLONING, rt->rt_fibnum); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK) { rtexpunge(rt2); RTFREE_LOCKED(rt2); ret = rn_addroute(v_arg, n_arg, head, treenodes); } else RTFREE_LOCKED(rt2); } } return ret; } /* * This code is the inverse of in_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * in_matroute(void *v_arg, struct radix_node_head *head) { struct radix_node *rn = rn_match(v_arg, head); struct rtentry *rt = (struct rtentry *)rn; /*XXX locking? 
*/ if (rt && rt->rt_refcnt == 0) { /* this is first reference */ if (rt->rt_flags & RTPRF_OURS) { rt->rt_flags &= ~RTPRF_OURS; rt->rt_rmx.rmx_expire = 0; } } return rn; } -static int rtq_reallyold = 60*60; /* one hour is "really old" */ +#ifdef VIMAGE_GLOBALS +static int rtq_reallyold; +static int rtq_minreallyold; +static int rtq_toomany; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, rtq_reallyold, 0, "Default expiration time on dynamically learned routes"); -static int rtq_minreallyold = 10; /* never automatically crank down to less */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, rtq_minreallyold, 0, "Minimum time to attempt to hold onto dynamically learned routes"); -static int rtq_toomany = 128; /* 128 cached routes is "too many" */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, rtq_toomany, 0, "Upper limit on dynamically learned routes"); /* * On last reference drop, mark the route as belonging to us so that it can be * timed out. */ static void in_clsroute(struct radix_node *rn, struct radix_node_head *head) { INIT_VNET_INET(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) return; if (rt->rt_flags & RTPRF_OURS) return; if (!(rt->rt_flags & (RTF_WASCLONED | RTF_DYNAMIC))) return; /* * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ if (V_rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; } else { rtexpunge(rt); } } struct rtqk_arg { struct radix_node_head *rnh; int draining; int killed; int found; int updating; time_t nextstop; }; /* * Get rid of old routes. When draining, this deletes everything, even when * the timeout is not expired yet. When updating, this makes sure that * nothing has a timeout longer than the current value of rtq_reallyold.
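 *
 * The three thresholds used here are the rtexpire, rtminexpire and
 * rtmaxcache sysctls declared above; a minimal illustrative sketch of
 * tuning one of them from userland, with the value and error handling
 * merely placeholders:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <err.h>
 *
 *	int secs = 600;
 *	if (sysctlbyname("net.inet.ip.rtexpire", NULL, NULL,
 *	    &secs, sizeof(secs)) == -1)
 *		err(1, "net.inet.ip.rtexpire");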
*/ static int in_rtqkill(struct radix_node *rn, void *rock) { INIT_VNET_INET(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; if (rt->rt_flags & RTPRF_OURS) { ap->found++; if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); err = in_rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0, rt->rt_fibnum); if (err) { log(LOG_WARNING, "in_rtqkill: error %d\n", err); } else { ap->killed++; } } else { if (ap->updating && (rt->rt_rmx.rmx_expire - time_uptime > V_rtq_reallyold)) { rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; } ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); } } return 0; } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout = RTQ_TIMEOUT; +#ifdef VIMAGE_GLOBALS +static int rtq_timeout; static struct callout rtq_timer; +#endif static void in_rtqtimo_one(void *rock); static void in_rtqtimo(void *rock) { int fibnum; void *newrock; struct timeval atv; KASSERT((rock == (void *)V_rt_tables[0][AF_INET]), ("in_rtqtimo: unexpected arg")); for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL) in_rtqtimo_one(newrock); } atv.tv_usec = 0; atv.tv_sec = V_rtq_timeout; callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); } static void in_rtqtimo_one(void *rock) { struct radix_node_head *rnh = rock; struct rtqk_arg arg; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = time_uptime + V_rtq_timeout; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); /* * Attempt to be somewhat dynamic about this: * If there are ``too many'' routes sitting around taking up space, * then crank down the timeout, and see if we can't make some more * go away. However, we make sure that we will never adjust more * than once in rtq_timeout seconds, to keep from cranking down too * hard. */ if ((arg.found - arg.killed > V_rtq_toomany) && (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && V_rtq_reallyold > V_rtq_minreallyold) { V_rtq_reallyold = 2 * V_rtq_reallyold / 3; if (V_rtq_reallyold < V_rtq_minreallyold) { V_rtq_reallyold = V_rtq_minreallyold; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", V_rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } } void in_rtqdrain(void) { VNET_ITERATOR_DECL(vnet_iter); struct radix_node_head *rnh; struct rtqk_arg arg; int fibnum; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_NET(vnet_iter); for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = V_rt_tables[fibnum][AF_INET]; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = 0; arg.draining = 1; arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } static int _in_rt_was_here; /* * Initialize our routing tree. */ int in_inithead(void **head, int off) { INIT_VNET_INET(curvnet); struct radix_node_head *rnh; /* XXX MRT * This can be called from vfs_export.c too in which case 'off' * will be 0. We know the correct value so just use that and * return directly if it was 0. 
* This is a hack that replaces an even worse hack on a bad hack * on a bad design. After RELENG_7 this should be fixed but that * will change the ABI, so for now do it this way. */ if (!rn_inithead(head, 32)) return 0; if (off == 0) /* XXX MRT see above */ return 1; /* only do the rest for a real routing table */ + + V_rtq_reallyold = 60*60; /* one hour is "really old" */ + V_rtq_minreallyold = 10; /* never automatically crank down to less */ + V_rtq_toomany = 128; /* 128 cached routes is "too many" */ + V_rtq_timeout = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in_addroute; rnh->rnh_matchaddr = in_matroute; rnh->rnh_close = in_clsroute; if (_in_rt_was_here == 0 ) { callout_init(&V_rtq_timer, CALLOUT_MPSAFE); in_rtqtimo(rnh); /* kick off timeout first time */ _in_rt_was_here = 1; } return 1; } /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes * that point to this address. If we don't do this, we may end up * using the old address in the future. The ones we always want to * get rid of are things like ARP entries, since the user might down * the interface, walk over to a completely different network, and * plug back in. */ struct in_ifadown_arg { struct ifaddr *ifa; int del; }; static int in_ifadownkill(struct radix_node *rn, void *xap) { struct in_ifadown_arg *ap = xap; struct rtentry *rt = (struct rtentry *)rn; RT_LOCK(rt); if (rt->rt_ifa == ap->ifa && (ap->del || !(rt->rt_flags & RTF_STATIC))) { /* * We need to disable the automatic prune that happens * in this case in rtrequest() because it will blow * away the pointers that rn_walktree() needs in order * to continue our descent. We will end up deleting all * the routes that rtrequest() would have in any case, * so that behavior is not needed there. */ rt->rt_flags &= ~RTF_CLONING; rtexpunge(rt); } RT_UNLOCK(rt); return 0; } int in_ifadown(struct ifaddr *ifa, int delete) { INIT_VNET_NET(curvnet); struct in_ifadown_arg arg; struct radix_node_head *rnh; int fibnum; if (ifa->ifa_addr->sa_family != AF_INET) return 1; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = V_rt_tables[fibnum][AF_INET]; arg.ifa = ifa; arg.del = delete; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_ifadownkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } return 0; } /* * inet versions of rt functions. These have fib extensions and * for now will just reference the _fib variants.
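 * As a hedged sketch of a typical fib-aware lookup through these wrappers
 * (mirroring ip_findroute() in ip_fastfwd.c below, destination setup
 * abbreviated):
 *
 *	struct route ro;
 *
 *	bzero(&ro, sizeof(ro));
 *	... fill ro.ro_dst with a sockaddr_in for the destination ...
 *	in_rtalloc_ign(&ro, RTF_CLONING, M_GETFIB(m));
 *	if (ro.ro_rt != NULL && (ro.ro_rt->rt_flags & RTF_UP))
 *		... route found in the mbuf's FIB ...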
* Eventually this order will be reversed. */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) { rtalloc_ign_fib(ro, ignflags, fibnum); } int in_rtrequest( int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, fibnum)); } struct rtentry * in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { return (rtalloc1_fib(dst, report, ignflags, fibnum)); } void in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); } void in_rtalloc(struct route *ro, u_int fibnum) { rtalloc_ign_fib(ro, 0UL, fibnum); } #if 0 int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); int in_rtioctl(u_long, caddr_t, u_int); int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); #endif Index: head/sys/netinet/in_var.h =================================================================== --- head/sys/netinet/in_var.h (revision 185087) +++ head/sys/netinet/in_var.h (revision 185088) @@ -1,328 +1,337 @@ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ #include #include /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first.
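 * Code relies on that layout to cast between the two; a small hedged
 * illustration using the IA_SIN() accessor defined below:
 *
 *	struct ifaddr *ifa;		obtained from an ifnet's address list
 *	if (ifa->ifa_addr->sa_family == AF_INET) {
 *		struct in_ifaddr *ia = (struct in_ifaddr *)ifa;
 *		struct sockaddr_in *sin = IA_SIN(ia);
 *		...
 *	}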
*/ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_{,sub}net{,mask} in host order */ u_long ia_net; /* network number of interface */ u_long ia_netmask; /* mask of net part */ u_long ia_subnet; /* subnet number, including net */ u_long ia_subnetmask; /* mask of subnet part */ struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ }; struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr)) & ~(((struct in_ifaddr *)(ifa))->ia_subnetmask)) #ifdef _KERNEL extern u_char inetctlerrmap[]; /* * Hash table for IP addresses. */ extern LIST_HEAD(in_ifaddrhashhead, in_ifaddr) *in_ifaddrhashtbl; extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; extern u_long in_ifaddrhmask; /* mask for hash table */ #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ { \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ } #endif /* + * IP datagram reassembly.
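 * The hash below folds the low bits of the source address with the IP id;
 * a hedged sketch of how a reassembly path would pick a bucket, where the
 * fragment-queue array name is hypothetical:
 *
 *	u_int hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
 *	head = &ipq[hash];	one fragment queue per bucket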
+ */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + +/* * This information should be part of the ifnet structure but we don't wish * to change that - as it might break a number of things */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; #ifdef notyet int rti_timev1; /* IGMPv1 querier present */ int rti_timev2; /* IGMPv2 querier present */ int rti_timer; /* report to general query */ int rti_qrv; /* querier robustness */ #endif }; /* * Internet multicast address structure. There is one of these for each IP * multicast group to which this host belongs on a given network interface. * For every entry on the interface's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. They are also * kept on a system-wide list to make it easier to keep our legacy IGMP code * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMP membership report timer */ u_int inm_state; /* state of the membership */ struct router_info *inm_rti; /* router info*/ u_int inm_refcount; /* reference count */ #ifdef notyet /* IGMPv3 source-specific multicast fields */ TAILQ_HEAD(, in_msfentry) inm_msf; /* all active source filters */ TAILQ_HEAD(, in_msfentry) inm_msf_record; /* recorded sources */ TAILQ_HEAD(, in_msfentry) inm_msf_exclude; /* exclude sources */ TAILQ_HEAD(, in_msfentry) inm_msf_include; /* include sources */ /* XXX: should this lot go to the router_info structure? */ /* XXX: can/should these be callouts? */ /* IGMP protocol timers */ int32_t inm_ti_curstate; /* current state timer */ int32_t inm_ti_statechg; /* state change timer */ /* IGMP report timers */ uint16_t inm_rpt_statechg; /* state change report timer */ uint16_t inm_rpt_toxx; /* fmode change report timer */ /* IGMP protocol state */ uint16_t inm_fmode; /* filter mode */ uint32_t inm_recsrc_count; /* # of recorded sources */ uint16_t inm_exclude_sock_count; /* # of exclude-mode sockets */ uint16_t inm_gass_count; /* # of g-a-s queries */ #endif }; #ifdef notyet /* * Internet multicast source filter list. This list is used to store * IP multicast source addresses for each membership on an interface. * TODO: Allocate these structures using UMA. * TODO: Find an easier way of linking the struct into two lists at once. */ struct in_msfentry { TAILQ_ENTRY(in_msfentry) isf_link; /* next filter in all-list */ TAILQ_ENTRY(in_msfentry) isf_next; /* next filter in queue */ struct in_addr isf_addr; /* the address of this source */ uint16_t isf_refcount; /* reference count */ uint16_t isf_reporttag; /* what to report to the IGMP router */ uint16_t isf_rexmit; /* retransmission state/count */ }; #endif #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif extern LIST_HEAD(in_multihead, in_multi) in_multihead; /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. 
In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in_multi_mtx; #define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) #define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) #define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) /* * Structure used by macros below to remember position when stepping through * all of the in_multi records. */ struct in_multistep { struct in_multi *i_inm; }; /* * Macro for looking up the in_multi record for a given IP multicast address * on a given interface. If no matching record is found, "inm" is set null. */ #define IN_LOOKUP_MULTI(addr, ifp, inm) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ /* struct in_multi *inm; */ \ do { \ struct ifmultiaddr *ifma; \ \ IN_MULTI_LOCK_ASSERT(); \ IF_ADDR_LOCK(ifp); \ TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \ if (ifma->ifma_addr->sa_family == AF_INET \ && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ (addr).s_addr) \ break; \ } \ (inm) = ifma ? ifma->ifma_protospec : 0; \ IF_ADDR_UNLOCK(ifp); \ } while(0) /* * Macro to step through all of the in_multi records, one at a time. * The current position is remembered in "step", which the caller must * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" * and get the first record. Both macros return a NULL "inm" when there * are no remaining records. */ #define IN_NEXT_MULTI(step, inm) \ /* struct in_multistep step; */ \ /* struct in_multi *inm; */ \ do { \ IN_MULTI_LOCK_ASSERT(); \ if (((inm) = (step).i_inm) != NULL) \ (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ } while(0) #define IN_FIRST_MULTI(step, inm) \ /* struct in_multistep step; */ \ /* struct in_multi *inm; */ \ do { \ IN_MULTI_LOCK_ASSERT(); \ (step).i_inm = LIST_FIRST(&V_in_multihead); \ IN_NEXT_MULTI((step), (inm)); \ } while(0) struct rtentry; struct route; struct ip_moptions; size_t imo_match_group(struct ip_moptions *, struct ifnet *, struct sockaddr *); struct in_msource *imo_match_source(struct ip_moptions *, size_t, struct sockaddr *); struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); void in_delmulti(struct in_multi *); void in_delmulti_locked(struct in_multi *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); void in_rtqdrain(void); void ip_input(struct mbuf *); int in_ifadown(struct ifaddr *ifa, int); void in_ifscrub(struct ifnet *, struct in_ifaddr *); struct mbuf *ip_fastforward(struct mbuf *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtalloc(struct route *ro, u_int fibnum); struct rtentry *in_rtalloc1(struct sockaddr *, int, u_long, u_int); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int in_rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); #if 0 int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); int in_rtioctl(u_long, caddr_t, u_int); int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); #endif #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ Index: head/sys/netinet/ip_divert.c =================================================================== --- head/sys/netinet/ip_divert.c (revision 185087) +++ head/sys/netinet/ip_divert.c (revision 185088) @@ -1,766 +1,768 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of 
California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if !defined(KLD_MODULE) #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_mac.h" #ifndef INET #error "IPDIVERT requires INET." #endif #ifndef IPFIREWALL #error "IPDIVERT requires IPFIREWALL" #endif #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw, see the divert(4) * manpage for features. * Internally, packets selected by ipfw in ip_input() or ip_output(), * and never diverted before, are passed to the input queue of the * divert socket with a given 'divert_port' number (as specified in * the matching ipfw rule), and they are tagged with a 16 bit cookie * (representing the rule number of the matching ipfw rule), which * is passed to the process reading from the socket. * * Packets written to the divert socket are again tagged with a cookie * (usually the same as above) and a destination address. * If the destination address is INADDR_ANY then the packet is * treated as outgoing and sent to ip_output(), otherwise it is * treated as incoming and sent to ip_input(). * In both cases, the packet is tagged with the cookie. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * ipfw processing will start at the rule number after the one * written in the cookie (so, tagging a packet with a cookie of 0 * will cause it to be effectively considered as a standard packet). */ /* Internal variables. */ +#ifdef VIMAGE_GLOBALS static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; +#endif static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ?
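 *
 * From userland the mechanism described above is driven through an
 * ordinary raw socket of the divert protocol.  A minimal hedged sketch of
 * a pass-through daemon, with the port number hypothetical and error
 * handling omitted:
 *
 *	int fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
 *	struct sockaddr_in sin;
 *
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(8668);	must match the ipfw divert rule
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *	for (;;) {
 *		char buf[65535];
 *		socklen_t slen = sizeof(sin);
 *		ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *		    (struct sockaddr *)&sin, &slen);
 *		inspect or rewrite the packet, then reinject it with the
 *		same sockaddr so the cookie and direction are preserved:
 *		sendto(fd, buf, n, 0, (struct sockaddr *)&sin, slen);
 *	}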
*/ /* * Initialize divert connection block queue. */ static void div_zone_change(void *tag) { uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); } static int div_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "divinp"); return (0); } static void div_inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } void div_init(void) { INIT_VNET_INET(curvnet); INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); LIST_INIT(&V_divcb); V_divcbinfo.ipi_listhead = &V_divcb; /* * XXX We don't use the hash list for divert IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_porthashmask); V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); } /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ void div_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); V_ipstat.ips_noproto++; m_freem(m); } /* * Divert a packet by passing it up to the divert socket at port 'port'. * * Set up generic address and protocol structures for the div_input routine, * then pass them along with the mbuf chain. */ static void divert_packet(struct mbuf *m, int incoming) { INIT_VNET_INET(curvnet); struct ip *ip; struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (mtag == NULL) { printf("%s: no divert tag\n", __func__); m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == 0) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { ip->ip_len = ntohs(ip->ip_len); in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; ip->ip_len = htons(ip->ip_len); } /* * Record receive interface address, if any. * But only for incoming packets. */ bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ if (incoming) { struct ifaddr *ifa; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket.
Theoretically, if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)divert_info(mtag)); INP_INFO_RLOCK(&V_divcbinfo); LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { INP_RLOCK(inp); sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, (struct mbuf *)0) == 0) { SOCKBUF_UNLOCK(&sa->so_rcv); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); INP_RUNLOCK(inp); break; } } INP_INFO_RUNLOCK(&V_divcbinfo); if (sa == NULL) { m_freem(m); V_ipstat.ips_noproto++; V_ipstat.ips_delivered--; } } /* * Deliver packet back into the IP processing machinery. * * If no address is specified, or the address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { INIT_VNET_INET(curvnet); struct m_tag *mtag; struct divert_tag *dt; int error = 0; struct mbuf *options; /* * An mbuf may not have come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; if (control) m_freem(control); /* XXX */ if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) { mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } dt = (struct divert_tag *)(mtag+1); m_tag_prepend(m, mtag); } else dt = (struct divert_tag *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; dt->cookie = sin->sin_port; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { struct ip *const ip = mtod(m, struct ip *); struct inpcb *inp; dt->info |= IP_FW_DIVERT_OUTPUT_FLAG; INP_INFO_WLOCK(&V_divcbinfo); inp = sotoinpcb(so); INP_RLOCK(inp); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { error = EINVAL; INP_RUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); m_freem(m); } else { /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); /* Send packet to output processing */ V_ipstat.ips_rawout++; /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present.
*/ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); options = NULL; /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requiring a reference to the pcb. */ if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_DONTWAIT); if (options == NULL) error = ENOBUFS; } INP_RUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); if (error == ENOBUFS) { m_freem(m); return (error); } error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); if (options != NULL) m_freem(options); } } else { dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC SOCK_LOCK(so); mac_socket_create_mbuf(so, m); SOCK_UNLOCK(so); #endif /* Send packet to input processing via netisr */ netisr_queue(NETISR_IP, m); } return error; cantsend: m_freem(m); return error; } static int div_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); if (error) { INP_INFO_WUNLOCK(&V_divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't have one, we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbbind * and should probably have its own family.
*/ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return 0; } static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { INIT_VNET_INET(so->so_vnet); /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { V_ipstat.ips_toosmall++; m_freem(m); return EINVAL; } /* Send packet */ return div_output(so, m, (struct sockaddr_in *)nam, control); } void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) return; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_divcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_divcbinfo); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&V_divcbinfo); for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) inp_list[i++] = inp; INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_divcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
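 *
 * A hedged sketch of the consumer side of this generation handshake,
 * buffer handling abbreviated (the sysctl name matches the SYSCTL_PROC()
 * entry below):
 *
 *	size_t len = 0;
 *	sysctlbyname("net.inet.divert.pcblist", NULL, &len, NULL, 0);
 *	char *buf = malloc(len);
 *	sysctlbyname("net.inet.divert.pcblist", buf, &len, NULL, 0);
 *	The list begins and ends with a struct xinpgen; if xig_gen differs
 *	between the two copies, the list mutated mid-copy: re-fetch it.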
*/ INP_INFO_RLOCK(&V_divcbinfo); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif struct pr_usrreqs div_usrreqs = { .pru_attach = div_attach, .pru_bind = div_bind, .pru_control = in_control, .pru_detach = div_detach, .pru_peeraddr = in_getpeeraddr, .pru_send = div_send, .pru_shutdown = div_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel }; struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = div_input, .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, .pr_usrreqs = &div_usrreqs }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; int n; switch (type) { case MOD_LOAD: /* * Protocol will be initialized by pf_proto_register(). * We don't have to register ip_protox because we are not * a true IP protocol that goes over the wire. */ err = pf_proto_register(PF_INET, &div_protosw); ip_divert_ptr = divert_packet; break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. */ INP_INFO_WLOCK(&V_divcbinfo); n = V_divcbinfo.ipi_count; if (n != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; } ip_divert_ptr = NULL; err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); INP_INFO_LOCK_DESTROY(&V_divcbinfo); uma_zdestroy(V_divcbinfo.ipi_zone); break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(ipdivert, 1); Index: head/sys/netinet/ip_fastfwd.c =================================================================== --- head/sys/netinet/ip_fastfwd.c (revision 185087) +++ head/sys/netinet/ip_fastfwd.c (revision 185088) @@ -1,616 +1,618 @@ /*- * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * ip_fastforward gets its speed from processing the forwarded packet to * completion (if_output on the other side) without any queues or netisrs. * The receiving interface DMAs the packet into memory, the upper half of * the driver calls ip_fastforward, we do our routing table lookup and directly * send it off to the outgoing interface, which DMAs the packet to the * network card. The only part of the packet we touch with the CPU is the * IP header (unless there are complex firewall rules touching other parts * of the packet, but that is up to you). We are essentially limited by bus * bandwidth and how fast the network card/driver can set up receives and * transmits. * * We handle basic errors, IP header errors, checksum errors, * destination unreachable, fragmentation and fragmentation needed, and * report them via ICMP to the sender. * * If something is not pure IPv4 unicast forwarding we fall back to * the normal ip_input processing path. We should only be called from * interfaces connected to the outside world. * * Firewalling is fully supported, including divert, ipfw fwd, and ipfilter/ * ipnat address rewrite. * * IPSEC is not supported if this host is a tunnel broker. IPSEC is * supported for connections to/from the local host. * * We try to do the least expensive (in CPU ops) checks and operations * first to catch junk with as little overhead as possible. * * We take full advantage of hardware support for IP checksum and * fragmentation offloading. * * We don't do ICMP redirect in the fast forwarding path. I have had my own * cases where two core routers with the Zebra routing suite would send millions * of ICMP redirects to connected hosts if the destination router was not the * default gateway. In one case it was filling the routing table of a host * with approximately 300,000 cloned redirect entries until it ran out of * kernel memory. However, the networking code proved very robust and it didn't * crash or fail in other ways. */ /* * Many thanks to Matt Thomas of NetBSD for the basic structure of ip_flow.c, * which is being followed here.
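 *
 * The whole path below is gated by the net.inet.ip.fastforwarding sysctl
 * declared right after these comments; a one-line hedged sketch of turning
 * it on from userland:
 *
 *	int one = 1;
 *	sysctlbyname("net.inet.ip.fastforwarding", NULL, NULL,
 *	    &one, sizeof(one));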
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -static int ipfastforward_active = 0; +#ifdef VIMAGE_GLOBALS +static int ipfastforward_active; +#endif SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, ipfastforward_active, 0, "Enable fast IP forwarding"); static struct sockaddr_in * ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) { INIT_VNET_INET(curvnet); struct sockaddr_in *dst; struct rtentry *rt; /* * Find route to destination. */ bzero(ro, sizeof(*ro)); dst = (struct sockaddr_in *)&ro->ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr.s_addr = dest.s_addr; in_rtalloc_ign(ro, RTF_CLONING, M_GETFIB(m)); /* * Route there and interface still up? */ rt = ro->ro_rt; if (rt && (rt->rt_flags & RTF_UP) && (rt->rt_ifp->if_flags & IFF_UP) && (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) { if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)rt->rt_gateway; } else { V_ipstat.ips_noroute++; V_ipstat.ips_cantforward++; if (rt) RTFREE(rt); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return NULL; } return dst; } /* * Try to forward a packet based on the destination address. * This is a fast path optimized for the plain forwarding case. * If the packet is handled (and consumed) here then we return NULL; * otherwise the mbuf is returned and the packet should be delivered * to ip_input for full processing. */ struct mbuf * ip_fastforward(struct mbuf *m) { INIT_VNET_INET(curvnet); struct ip *ip; struct mbuf *m0 = NULL; struct route ro; struct sockaddr_in *dst = NULL; struct ifnet *ifp; struct in_addr odest, dest; u_short sum, ip_len; int error = 0; int hlen, mtu; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif /* * Are we active and forwarding packets? */ if (!V_ipfastforward_active || !V_ipforwarding) return m; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ro.ro_rt = NULL; /* * Step 1: check for packet drop conditions (and sanity checks) */ /* * Is entire packet big enough? */ if (m->m_pkthdr.len < sizeof(struct ip)) { V_ipstat.ips_tooshort++; goto drop; } /* * Is first mbuf large enough for ip header and is header present? */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { V_ipstat.ips_toosmall++; return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); /* * Is it IPv4? */ if (ip->ip_v != IPVERSION) { V_ipstat.ips_badvers++; goto drop; } /* * Is IP header length correct and is it in first mbuf? */ hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ V_ipstat.ips_badlen++; goto drop; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { V_ipstat.ips_badhlen++; return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); } /* * Checksum correct? */ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); else { if (hlen == sizeof(struct ip)) sum = in_cksum_hdr(ip); else sum = in_cksum(m, hlen); } if (sum) { V_ipstat.ips_badsum++; goto drop; } /* * Remember that we have checked the IP header and found it valid. */ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); ip_len = ntohs(ip->ip_len); /* * Is IP length longer than the packet we have got?
*/ if (m->m_pkthdr.len < ip_len) { V_ipstat.ips_tooshort++; goto drop; } /* * Is packet longer than IP header tells us? If yes, truncate packet. */ if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Is packet from or to 127/8? */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { V_ipstat.ips_badaddr++; goto drop; } #ifdef ALTQ /* * Is packet dropped by traffic conditioner? */ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) goto drop; #endif /* * Step 2: fallback conditions to normal ip_input path processing */ /* * Only IP packets without options */ if (ip->ip_hl != (sizeof(struct ip) >> 2)) { if (ip_doopts == 1) return m; else if (ip_doopts == 2) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB, 0, 0); return NULL; /* mbuf already free'd */ } /* else ignore IP options and continue */ } /* * Only unicast IP, not from loopback, no L2 or IP broadcast, * no multicast, no INADDR_ANY * * XXX: Probably some of these checks could be direct drop * conditions. However it is not clear whether there are some * hacks or obscure behaviours which make it necessary to * let ip_input handle it. We play safe here and let ip_input * deal with it until it is proven that we can directly drop it. */ if ((m->m_flags & (M_BCAST|M_MCAST)) || (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST || ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || ip->ip_src.s_addr == INADDR_ANY || ip->ip_dst.s_addr == INADDR_ANY ) return m; /* * Is it for a local address on this host? */ if (in_localip(ip->ip_dst)) return m; V_ipstat.ips_total++; /* * Step 3: incoming packet firewall processing */ /* * Convert to host representation */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); odest.s_addr = dest.s_addr = ip->ip_dst.s_addr; /* * Run through list of ipfilter hooks for input packets */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passin; if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) || m == NULL) goto drop; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); /* m may have changed by pfil hook */ dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ if (odest.s_addr != dest.s_addr) { /* * Is it now for a local address on this host? */ if (in_localip(dest)) goto forwardlocal; /* * Go on with new destination address */ } #ifdef IPFIREWALL_FORWARD if (m->m_flags & M_FASTFWD_OURS) { /* * ipfw changed it for a local address on this host. */ goto forwardlocal; } #endif /* IPFIREWALL_FORWARD */ passin: /* * Step 4: decrement TTL and look up route */ /* * Check TTL */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return NULL; /* mbuf already free'd */ } /* * Decrement the TTL and incrementally change the IP header checksum. * Don't bother doing this with hw checksum offloading, it's faster * doing it right here. */ ip->ip_ttl -= IPTTLDEC; if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8)) ip->ip_sum -= ~htons(IPTTLDEC << 8); else ip->ip_sum += htons(IPTTLDEC << 8); #ifdef IPSTEALTH } #endif /* * Find route to destination.
*/ if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ ifp = ro.ro_rt->rt_ifp; /* * Immediately drop blackholed traffic, and directed broadcasts * for either the all-ones or all-zero subnet addresses on * locally attached networks. */ if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0) goto drop; /* * Step 5: outgoing firewall packet processing */ /* * Run through list of hooks for output packets. */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) { goto drop; } M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ #ifndef IPFIREWALL_FORWARD if (odest.s_addr != dest.s_addr) { #else fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { #endif /* IPFIREWALL_FORWARD */ /* * Is it now for a local address on this host? */ #ifndef IPFIREWALL_FORWARD if (in_localip(dest)) { #else if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) { #endif /* IPFIREWALL_FORWARD */ forwardlocal: /* * Return packet for processing by ip_input(). * Keep host byte order as expected at ip_input's * "ours"-label. */ m->m_flags |= M_FASTFWD_OURS; if (ro.ro_rt) RTFREE(ro.ro_rt); return m; } /* * Redo route lookup with new destination address */ #ifdef IPFIREWALL_FORWARD if (fwd_tag) { dest.s_addr = ((struct sockaddr_in *) (fwd_tag + 1))->sin_addr.s_addr; m_tag_delete(m, fwd_tag); } #endif /* IPFIREWALL_FORWARD */ RTFREE(ro.ro_rt); if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ ifp = ro.ro_rt->rt_ifp; } passout: /* * Step 6: send off the packet */ /* * Check if route is dampened (when ARP is unable to resolve) */ if ((ro.ro_rt->rt_flags & RTF_REJECT) && (ro.ro_rt->rt_rmx.rmx_expire == 0 || time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } #ifndef ALTQ /* * Check if there is enough space in the interface queue */ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { V_ipstat.ips_odropped++; /* would send source quench here but that is deprecated */ goto drop; } #endif /* * Check if media link state of interface is not down */ if (ifp->if_link_state == LINK_STATE_DOWN) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } /* * Check if packet fits MTU or if hardware will fragment for us */ if (ro.ro_rt->rt_rmx.rmx_mtu) mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); else mtu = ifp->if_mtu; if (ip->ip_len <= mtu || (ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) { /* * Restore packet header fields to original values */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* * Send off the packet via outgoing interface */ error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro.ro_rt); } else { /* * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ if (ip->ip_off & IP_DF) { V_ipstat.ips_cantfrag++; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); goto consumed; } else { /* * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; /* * ip_fragment expects ip_len and ip_off in host byte * order but returns all packets in network byte order */ if (ip_fragment(ip, &m, mtu, ifp->if_hwassist, (~ifp->if_hwassist & CSUM_DELAY_IP))) { goto drop; } KASSERT(m != NULL, ("null mbuf and no error")); /* * Send off the fragments via outgoing interface
*/ error = 0; do { m0 = m->m_nextpkt; m->m_nextpkt = NULL; error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro.ro_rt); if (error) break; } while ((m = m0) != NULL); if (error) { /* Reclaim remaining fragments */ for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m_freem(m); } } else V_ipstat.ips_fragmented++; } } if (error != 0) V_ipstat.ips_odropped++; else { ro.ro_rt->rt_rmx.rmx_pksent++; V_ipstat.ips_forward++; V_ipstat.ips_fastforward++; } consumed: RTFREE(ro.ro_rt); return NULL; drop: if (m) m_freem(m); if (ro.ro_rt) RTFREE(ro.ro_rt); return NULL; } Index: head/sys/netinet/ip_icmp.c =================================================================== --- head/sys/netinet/ip_icmp.c (revision 185087) +++ head/sys/netinet/ip_icmp.c (revision 185088) @@ -1,934 +1,953 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif #include #include /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. 
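 *
 * (Editor's aside: the fragment transmit loop at the end of
 * ip_fastforward() above walks the m_nextpkt chain that ip_fragment()
 * produced, detaching each fragment before handing it to if_output(), and
 * reclaims the untransmitted remainder on error. The same pattern on a
 * generic singly linked list; struct pkt, send_one() and free_one() are
 * invented stand-ins:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
#include <stdlib.h>

struct pkt { struct pkt *next; };

static int  send_one(struct pkt *p) { (void)p; return (0); }	/* stand-in */
static void free_one(struct pkt *p) { free(p); }		/* stand-in */

static int
send_chain(struct pkt *head)
{
	struct pkt *next = NULL;
	int error = 0;

	while (head != NULL) {
		next = head->next;
		head->next = NULL;	/* detach before handing off */
		error = send_one(head);	/* consumes head, even on error */
		if (error)
			break;
		head = next;
	}
	/* On failure, reclaim the fragments that were never handed off. */
	while (error && next != NULL) {
		head = next->next;
		free_one(next);
		next = head;
	}
	return (error);
}
/* ---- end sketch ---- */
/*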
*/ -struct icmpstat icmpstat; +#ifdef VIMAGE_GLOBALS +struct icmpstat icmpstat; +static int icmpmaskrepl; +static u_int icmpmaskfake; +static int drop_redirect; +static int log_redirect; +static int icmplim; +static int icmplim_output; +static char reply_src[IFNAMSIZ]; +static int icmp_rfi; +static int icmp_quotelen; +static int icmpbmcastecho; +#endif + SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, icmpstat, icmpstat, ""); -static int icmpmaskrepl = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, icmpmaskrepl, 0, "Reply to ICMP Address Mask Request packets."); -static u_int icmpmaskfake = 0; SYSCTL_V_UINT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW, icmpmaskfake, 0, "Fake reply to ICMP Address Mask Request packets."); -static int drop_redirect = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, drop_redirect, 0, "Ignore ICMP redirects"); -static int log_redirect = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, log_redirect, 0, "Log ICMP redirects to the console"); -static int icmplim = 200; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, icmplim, 0, "Maximum number of ICMP responses per second"); -static int icmplim_output = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, icmplim_output, 0, "Enable rate limiting of ICMP responses"); -static char reply_src[IFNAMSIZ]; SYSCTL_V_STRING(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW, reply_src, IFNAMSIZ, "icmp reply source for non-local packets."); -static int icmp_rfi = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW, icmp_rfi, 0, "ICMP reply from incoming interface for " "non-local packets"); -static int icmp_quotelen = 8; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW, icmp_quotelen, 0, "Number of bytes from original packet to " "quote in ICMP reply"); /* * ICMP broadcast echo sysctl */ -static int icmpbmcastecho = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, icmpbmcastecho, 0, ""); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; + +void +icmp_init(void) +{ + INIT_VNET_INET(curvnet); + + V_icmpmaskrepl = 0; + V_icmpmaskfake = 0; + V_drop_redirect = 0; + V_log_redirect = 0; + V_icmplim = 200; + V_icmplim_output = 1; + V_icmp_rfi = 0; + V_icmp_quotelen = 8; + V_icmpbmcastecho = 0; +} /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu) { INIT_VNET_INET(curvnet); register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiphlen = oip->ip_hl << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen, icmpelen, nlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) V_icmpstat.icps_error++; /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. 
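 *
 * (Editor's aside: the hunks above are part of the VIMAGE_GLOBALS
 * conversion -- initializers move out of the declarations and into the new
 * icmp_init(), so each virtual network stack instance can carry its own
 * copy of the former globals with its own defaults. The indirection in
 * miniature; struct toy_vnet and TOY_V() are invented for illustration and
 * are much simpler than the real V_ machinery:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
struct toy_vnet {
	int icmplim;		/* was: static int icmplim = 200; */
	int icmpmaskrepl;	/* was: static int icmpmaskrepl = 0; */
};

static struct toy_vnet vnet0;		/* one instance; VIMAGE allows many */
#define	TOY_V(sym)	(vnet0.sym)	/* stand-in for the V_ accessor */

static void
toy_icmp_init(void)
{
	/* Defaults are now set per instance at init time. */
	TOY_V(icmplim) = 200;
	TOY_V(icmpmaskrepl) = 0;
}
/* ---- end sketch ---- */
/*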
*/ if (n->m_flags & M_DECRYPTED) goto freeit; if (oip->ip_off & ~(IP_MF|IP_DF)) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiphlen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { V_icmpstat.icps_oldicmp++; goto freeit; } /* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */ if (oiphlen + 8 > n->m_len) goto freeit; /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunately this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)) goto freeit; th = (struct tcphdr *)((caddr_t)oip + oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (oip->ip_len < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && ((n = m_pullup(n, oiphlen + tcphlen)) == NULL)) goto freeit; icmpelen = max(tcphlen, min(V_icmp_quotelen, oip->ip_len - oiphlen)); } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_DONTWAIT, MT_DATA); else m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, ICMP_MINLEN + icmplen); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); V_icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; nip->ip_len = htons(nip->ip_len); nip->ip_off = htons(nip->ip_off); /* * Set up ICMP message mbuf and copy old IP header (without options) * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well. */ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = m->m_len; nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; icmp_reflect(m); freeit: m_freem(n); } /* * Process a received ICMP message.
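 *
 * (Editor's aside: the quote-length arithmetic in icmp_error() above quotes
 * at least the transport header (the full TCP header for TCP, 8 bytes
 * otherwise), at most icmp_quotelen bytes of payload, and never more than
 * the bytes actually present in the possibly truncated mbuf chain. The same
 * computation in isolation; quote_len() is an invented name:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
static unsigned int
quote_len(unsigned int iphlen, unsigned int iplen, unsigned int nlen,
    unsigned int thlen, unsigned int quotelen)
{
	unsigned int elen;

	/* min(quotelen, payload), like min(V_icmp_quotelen, ip_len - hlen) */
	elen = iplen - iphlen;
	if (elen > quotelen)
		elen = quotelen;
	/* max() against the transport header; 8 for non-TCP protocols */
	if (elen < (thlen ? thlen : 8))
		elen = thlen ? thlen : 8;
	/* ip_forward() may hand us truncated packets: clamp to nlen */
	return (iphlen + elen < nlen ? iphlen + elen : nlen);
}
/* ---- end sketch ---- */
/*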
*/ void icmp_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); struct icmp *icp; struct in_ifaddr *ia; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = off; int icmplen = ip->ip_len; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_src)); printf("icmp_input from %s to %s, len %d\n", buf, inet_ntoa(ip->ip_dst), icmplen); } #endif if (icmplen < ICMP_MINLEN) { V_icmpstat.icps_tooshort++; goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == 0) { V_icmpstat.icps_tooshort++; return; } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { V_icmpstat.icps_checksum++; goto freeit; } m->m_len += hlen; m->m_data -= hlen; if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { /* * Deliver very specific ICMP type only. */ switch (icp->icmp_type) { case ICMP_UNREACH: case ICMP_TIMXCEED: break; default: goto freeit; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; V_icmpstat.icps_inhist[icp->icmp_type]++; code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. * Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; goto deliver; case ICMP_SOURCEQUENCH: if (code) goto badcode; code = PRC_QUENCH; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { V_icmpstat.icps_badlen++; goto freeit; } icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. 
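 *
 * (Editor's aside: icmp_input() above temporarily advances m_data/m_len
 * past the IP header so in_cksum() covers only the ICMP portion, then
 * restores them. The flat-buffer equivalent; icmp_cksum_ok() is an invented
 * name and cksum16() is the RFC 1071 sum sketched earlier:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
#include <stdint.h>
#include <stddef.h>

uint16_t cksum16(const void *, size_t);	/* from the earlier sketch */

static int
icmp_cksum_ok(const uint8_t *pkt, size_t pktlen, size_t hlen)
{
	if (pktlen < hlen + 8)		/* ICMP_MINLEN */
		return (0);
	/* A valid ICMP message sums to zero from the type byte onwards. */
	return (cksum16(pkt + hlen, pktlen - hlen) == 0);
}
/* ---- end sketch ---- */
/*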
*/ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: V_icmpstat.icps_badcode++; break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { V_icmpstat.icps_bmcastecho++; break; } icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; else goto reflect; case ICMP_TSTAMP: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { V_icmpstat.icps_bmcasttstamp++; break; } if (icmplen < ICMP_TSLEN) { V_icmpstat.icps_badlen++; break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; else goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. */ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == 0) break; if (ia->ia_ifp == 0) break; icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } reflect: ip->ip_len += hlen; /* since ip_input deducts this */ V_icmpstat.icps_reflect++; V_icmpstat.icps_outhist[icp->icmp_type]++; icmp_reflect(m); return; case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { V_icmpstat.icps_badlen++; break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). 
*/ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); printf("redirect dst %s to %s\n", buf, inet_ntoa(icp->icmp_gwaddr)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { in_rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&icmpsrc); #endif break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: default: break; } raw: rip_input(m, off); return; freeit: m_freem(m); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifn; struct in_ifaddr *ia; struct in_addr t; struct mbuf *opts = 0; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { m_freem(m); /* Bad return address */ V_icmpstat.icps_badaddr++; goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) goto match; /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ if (m->m_pkthdr.rcvif != NULL && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) goto match; } } /* * If the packet was transiting through us, use the address of * the interface the packet came through in. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && m->m_pkthdr.rcvif != NULL) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); goto match; } } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifn = ifunit(V_reply_src))) { TAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); goto match; } } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. 
*/ ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); if (ia == NULL) { m_freem(m); V_icmpstat.icps_noroute++; goto done; } match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif t = IA_SIN(ia)->sin_addr; ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { register u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == 0 && (opts = m_gethdr(M_DONTWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } /* * Now strip out original options by copying rest of first * mbuf's data back, and adjust the IP length. */ ip->ip_len -= optlen; ip->ip_v = IPVERSION; ip->ip_hl = 5; m->m_len -= optlen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= optlen; optlen += sizeof(struct ip); bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), (unsigned)(m->m_len - sizeof(struct ip))); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { register struct ip *ip = mtod(m, struct ip *); register int hlen; register struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); printf("icmp_send dst %s src %s\n", buf, inet_ntoa(ip->ip_src)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } n_time iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. */ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. 
* * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ int badport_bandlim(int which) { INIT_VNET_INET(curvnet); #define N(a) (sizeof (a) / sizeof (a[0])) static struct rate { const char *type; struct timeval lasttime; int curpps; } rates[BANDLIM_MAX+1] = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, { "closed port RST response" }, { "open port RST response" }, { "icmp6 unreach response" } }; /* * Return ok status if feature disabled or argument out of range. */ if (V_icmplim > 0 && (u_int) which < N(rates)) { struct rate *r = &rates[which]; int opps = r->curpps; if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim)) return -1; /* discard packet */ /* * If we've dropped below the threshold after having * rate-limited traffic print the message. This preserves * the previous behaviour at the expense of added complexity. */ if (V_icmplim_output && opps > V_icmplim) printf("Limiting %s from %d to %d packets/sec\n", r->type, opps, V_icmplim); } return 0; /* okay to send packet */ #undef N } Index: head/sys/netinet/ip_icmp.h =================================================================== --- head/sys/netinet/ip_icmp.h (revision 185087) +++ head/sys/netinet/ip_icmp.h (revision 185088) @@ -1,210 +1,211 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IP_ICMP_H_ #define _NETINET_IP_ICMP_H_ /* * Interface Control Message Protocol Definitions. * Per RFC 792, September 1981. 
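 *
 * (Editor's aside: badport_bandlim() in ip_icmp.c above leans on
 * ppsratecheck() for the actual limiting. A userland approximation of that
 * primitive -- allow at most `limit` events per wall-clock second; pps_ok()
 * is an invented name and this is far cruder than the kernel's
 * implementation:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
#include <time.h>

static int
pps_ok(time_t *last, int *count, int limit)
{
	time_t now = time(NULL);

	if (now != *last) {	/* a new one-second window begins */
		*last = now;
		*count = 0;
	}
	return (++*count <= limit);	/* 1: under the limit, 0: discard */
}
/* ---- end sketch ---- */
/*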
*/ /* * Internal structure of an ICMP Router Advertisement */ struct icmp_ra_addr { u_int32_t ira_addr; u_int32_t ira_preference; }; /* * Structure of an icmp header. */ struct icmphdr { u_char icmp_type; /* type of message, see below */ u_char icmp_code; /* type sub code */ u_short icmp_cksum; /* ones complement cksum of struct */ }; /* * Structure of an icmp packet. * * XXX: should start with a struct icmphdr. */ struct icmp { u_char icmp_type; /* type of message, see below */ u_char icmp_code; /* type sub code */ u_short icmp_cksum; /* ones complement cksum of struct */ union { u_char ih_pptr; /* ICMP_PARAMPROB */ struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ struct ih_idseq { n_short icd_id; n_short icd_seq; } ih_idseq; int ih_void; /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */ struct ih_pmtu { n_short ipm_void; n_short ipm_nextmtu; } ih_pmtu; struct ih_rtradv { u_char irt_num_addrs; u_char irt_wpa; u_int16_t irt_lifetime; } ih_rtradv; } icmp_hun; #define icmp_pptr icmp_hun.ih_pptr #define icmp_gwaddr icmp_hun.ih_gwaddr #define icmp_id icmp_hun.ih_idseq.icd_id #define icmp_seq icmp_hun.ih_idseq.icd_seq #define icmp_void icmp_hun.ih_void #define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void #define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu #define icmp_num_addrs icmp_hun.ih_rtradv.irt_num_addrs #define icmp_wpa icmp_hun.ih_rtradv.irt_wpa #define icmp_lifetime icmp_hun.ih_rtradv.irt_lifetime union { struct id_ts { /* ICMP Timestamp */ n_time its_otime; /* Originate */ n_time its_rtime; /* Receive */ n_time its_ttime; /* Transmit */ } id_ts; struct id_ip { struct ip idi_ip; /* options and then 64 bits of data */ } id_ip; struct icmp_ra_addr id_radv; u_int32_t id_mask; char id_data[1]; } icmp_dun; #define icmp_otime icmp_dun.id_ts.its_otime #define icmp_rtime icmp_dun.id_ts.its_rtime #define icmp_ttime icmp_dun.id_ts.its_ttime #define icmp_ip icmp_dun.id_ip.idi_ip #define icmp_radv icmp_dun.id_radv #define icmp_mask icmp_dun.id_mask #define icmp_data icmp_dun.id_data }; /* * Lower bounds on packet lengths for various types. * For the error advice packets we must first ensure that the * packet is large enough to contain the returned ip header. * Only then can we do the check to see if 64 bits of packet * data have been returned, since we need to check the returned * ip header length. */ #define ICMP_MINLEN 8 /* abs minimum */ #define ICMP_TSLEN (8 + 3 * sizeof (n_time)) /* timestamp */ #define ICMP_MASKLEN 12 /* address mask */ #define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ #define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) /* N.B.: must separately check that ip_hl >= 5 */ /* * Definition of type and code field values.
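 *
 * (Editor's aside: a small consumer of the struct icmp fields defined above
 * -- filling in an echo request and computing its checksum. make_echo() is
 * an invented name; cksum16() is the RFC 1071 sum sketched earlier; len is
 * ICMP_MINLEN plus the payload length, and the buffer is assumed flat:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>		/* htons() in userland */

uint16_t cksum16(const void *, size_t);	/* from the earlier sketch */

static void
make_echo(struct icmp *icp, uint16_t id, uint16_t seq, size_t len)
{
	icp->icmp_type = 8;	/* ICMP_ECHO, defined just below */
	icp->icmp_code = 0;
	icp->icmp_id = htons(id);
	icp->icmp_seq = htons(seq);
	icp->icmp_cksum = 0;	/* must be zero while summing */
	icp->icmp_cksum = htons(cksum16(icp, len));
}
/* ---- end sketch ---- */
/*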
*/ #define ICMP_ECHOREPLY 0 /* echo reply */ #define ICMP_UNREACH 3 /* dest unreachable, codes: */ #define ICMP_UNREACH_NET 0 /* bad net */ #define ICMP_UNREACH_HOST 1 /* bad host */ #define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ #define ICMP_UNREACH_PORT 3 /* bad port */ #define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ #define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ #define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ #define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ #define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ #define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ #define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ #define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ #define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ #define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ #define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ #define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ #define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ #define ICMP_REDIRECT 5 /* shorter route, codes: */ #define ICMP_REDIRECT_NET 0 /* for network */ #define ICMP_REDIRECT_HOST 1 /* for host */ #define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ #define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ #define ICMP_ALTHOSTADDR 6 /* alternate host address */ #define ICMP_ECHO 8 /* echo service */ #define ICMP_ROUTERADVERT 9 /* router advertisement */ #define ICMP_ROUTERADVERT_NORMAL 0 /* normal advertisement */ #define ICMP_ROUTERADVERT_NOROUTE_COMMON 16 /* selective routing */ #define ICMP_ROUTERSOLICIT 10 /* router solicitation */ #define ICMP_TIMXCEED 11 /* time exceeded, code: */ #define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ #define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ #define ICMP_PARAMPROB 12 /* ip header bad */ #define ICMP_PARAMPROB_ERRATPTR 0 /* error at param ptr */ #define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. 
absent */ #define ICMP_PARAMPROB_LENGTH 2 /* bad length */ #define ICMP_TSTAMP 13 /* timestamp request */ #define ICMP_TSTAMPREPLY 14 /* timestamp reply */ #define ICMP_IREQ 15 /* information request */ #define ICMP_IREQREPLY 16 /* information reply */ #define ICMP_MASKREQ 17 /* address mask request */ #define ICMP_MASKREPLY 18 /* address mask reply */ #define ICMP_TRACEROUTE 30 /* traceroute */ #define ICMP_DATACONVERR 31 /* data conversion error */ #define ICMP_MOBILE_REDIRECT 32 /* mobile host redirect */ #define ICMP_IPV6_WHEREAREYOU 33 /* IPv6 where-are-you */ #define ICMP_IPV6_IAMHERE 34 /* IPv6 i-am-here */ #define ICMP_MOBILE_REGREQUEST 35 /* mobile registration req */ #define ICMP_MOBILE_REGREPLY 36 /* mobile registration reply */ #define ICMP_SKIP 39 /* SKIP */ #define ICMP_PHOTURIS 40 /* Photuris */ #define ICMP_PHOTURIS_UNKNOWN_INDEX 1 /* unknown sec index */ #define ICMP_PHOTURIS_AUTH_FAILED 2 /* auth failed */ #define ICMP_PHOTURIS_DECRYPT_FAILED 3 /* decrypt failed */ #define ICMP_MAXTYPE 40 #define ICMP_INFOTYPE(type) \ ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \ (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \ (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) #ifdef _KERNEL void icmp_error(struct mbuf *, int, int, n_long, int); void icmp_input(struct mbuf *, int); +void icmp_init(void); int ip_next_mtu(int, int); #endif #endif Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 185087) +++ head/sys/netinet/ip_input.c (revision 185088) @@ -1,1664 +1,1688 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #ifdef IPSEC #include #endif /* IPSEC */ #include /* XXX: Temporary until ipfw_ether and ipfw_bridge are converted. */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif -int rsvp_on = 0; +#ifdef VIMAGE_GLOBALS +static int ipsendredirects; +static int ip_checkinterface; +static int ip_keepfaith; +static int ip_sendsourcequench; +int ip_defttl; +int ip_do_randomid; +int ipforwarding; +struct in_ifaddrhead in_ifaddrhead; /* first inet address */ +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ +u_long in_ifaddrhmask; /* mask for hash table */ +struct ipstat ipstat; +static int ip_rsvp_on; +struct socket *ip_rsvpd; +int rsvp_on; +static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; +static int maxnipq; /* Administrative limit on # reass queues. */ +static int maxfragsperpacket; +int ipstealth; +static int nipq; /* Total # of reass queues */ +#endif -int ipforwarding = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, ipforwarding, 0, "Enable IP forwarding between interfaces"); -static int ipsendredirects = 1; /* XXX */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, ipsendredirects, 0, "Enable sending IP redirects"); -int ip_defttl = IPDEFTTL; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets"); -static int ip_keepfaith = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translator daemon"); -static int ip_sendsourcequench = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0, "Enable the transmission of source quench packets"); -int ip_do_randomid = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received.
*/ -static int ip_checkinterface = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, ip_checkinterface, 0, "Verify packet arrives on correct interface"); struct pfil_head inet_pfil_hook; /* Packet filter hooks */ static struct ifqueue ipintrq; static int ipqmaxlen = IFQ_MAXLEN; extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; -struct in_ifaddrhead in_ifaddrhead; /* first inet address */ -struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ -u_long in_ifaddrhmask; /* mask for hash table */ SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); -struct ipstat ipstat; SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); -/* - * IP datagram reassembly. - */ -#define IPREASS_NHASH_LOG2 6 -#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) -#define IPREASS_HMASK (IPREASS_NHASH - 1) -#define IPREASS_HASH(x,y) \ - (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) - static uma_zone_t ipq_zone; -static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static struct mtx ipqlock; #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) #define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) #define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) static void maxnipq_update(void); static void ipq_zone_change(void *); -static int maxnipq; /* Administrative limit on # reass queues. */ -static int nipq = 0; /* Total # of reass queues */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, nipq, 0, "Current number of IPv4 fragment reassembly queue entries"); -static int maxfragsperpacket; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); struct callout ipport_tick_callout; #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH -int ipstealth = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * ipfw_ether and ipfw_bridge hooks. * XXX: Temporary until those are converted to pfil_hooks as well. */ ip_fw_chk_t *ip_fw_chk_ptr = NULL; ip_dn_io_t *ip_dn_io_ptr = NULL; int fw_one_pass = 1; static void ip_freef(struct ipqhead *, struct ipq *); /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. 
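 *
 * (Editor's aside: ip_init() below fills ip_protox[] by defaulting every
 * protocol number to the raw handler and then patching in the protocols the
 * kernel actually implements. The idea in miniature; proto_input_t,
 * raw_input() and tcp_input_stub() are invented stand-ins:)
 */
/* ---- illustrative sketch (editor's addition, not part of this revision) ---- */
typedef void (*proto_input_t)(const void *pkt, int len);

static void raw_input(const void *pkt, int len) { (void)pkt; (void)len; }
static void tcp_input_stub(const void *pkt, int len) { (void)pkt; (void)len; }

static proto_input_t protox[256];

static void
protox_init(void)
{
	int i;

	for (i = 0; i < 256; i++)	/* unknown protocols -> raw handler */
		protox[i] = raw_input;
	protox[6] = tcp_input_stub;	/* IPPROTO_TCP */
}
/* ---- end sketch ---- */
/*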
*/ void ip_init(void) { INIT_VNET_INET(curvnet); struct protosw *pr; int i; + V_ipsendredirects = 1; /* XXX */ + V_ip_checkinterface = 0; + V_ip_keepfaith = 0; + V_ip_sendsourcequench = 0; + V_rsvp_on = 0; + V_ip_defttl = IPDEFTTL; + V_ip_do_randomid = 0; + V_ipforwarding = 0; + V_ipstealth = 0; + V_nipq = 0; /* Total # of reass queues */ + + V_ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ + V_ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ + V_ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ + V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_reservedlow = 0; + V_ipport_randomized = 1; /* user controlled via sysctl */ + V_ipport_randomcps = 10; /* user controlled via sysctl */ + V_ipport_randomtime = 45; /* user controlled via sysctl */ + V_ipport_stoprandom = 0; /* toggled by ipport_tick */ + +#ifdef NOTYET + /* XXX global static but not instantiated in this file */ + V_ipfastforward_active = 0; + V_subnetsarelocal = 0; + V_sameprefixcarponly = 0; +#endif + TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } /* Initialize packet filter hooks. */ inet_pfil_hook.ph_type = PFIL_TYPE_AF; inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); /* Initialize IP reassembly queue. */ IPQ_LOCK_INIT(); for (i = 0; i < IPREASS_NHASH; i++) TAILQ_INIT(&V_ipq[i]); V_maxnipq = nmbclusters / 32; V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxnipq_update(); /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); ipport_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. */ V_ip_id = time_second & 0xffff; ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); netisr_register(NETISR_IP, ip_input, &ipintrq, 0); } void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { INIT_VNET_INET(curvnet); struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; int checkif, hlen = 0; u_short sum; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall or NAT changed destination to local. * We expect ip_len and ip_off to be in host byte order. 
*/ m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; goto ours; } V_ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { V_ipstat.ips_toosmall++; return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { V_ipstat.ips_badvers++; goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ V_ipstat.ips_badhlen++; goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { V_ipstat.ips_badhlen++; return; } ip = mtod(m, struct ip *); } /* 127/8 must not appear on wire - RFC1122 */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { V_ipstat.ips_badaddr++; goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { V_ipstat.ips_badsum++; goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif /* * Convert fields to host representation. */ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { V_ipstat.ips_badlen++; goto bad; } ip->ip_off = ntohs(ip->ip_off); /* * Check that the amount of data in the buffers * is at least as much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: V_ipstat.ips_tooshort++; goto bad; } if (m->m_pkthdr.len > ip->ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip->ip_len; m->m_pkthdr.len = ip->ip_len; } else m_adj(m, ip->ip_len - m->m_pkthdr.len); } #ifdef IPSEC /* * Bypass packet filtering for packets from a tunnel (gif). */ if (ip_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); #ifdef IPFIREWALL_FORWARD if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { /* * Directly ship on the packet. This allows us to forward packets * that were destined for us to some other directly connected * host. */ ip_forward(m, dchg); return; } #endif /* IPFIREWALL_FORWARD */ passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP: it snatches any PATH packet of the RSVP protocol, no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else.
Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && #ifdef DEV_CARP !m->m_pkthdr.rcvif->if_carp && #endif (dchg == 0); /* * Check for exact addresses in the hash bucket. */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) goto ours; } /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (m->m_pkthdr.rcvif != NULL && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) goto ours; if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) goto ours; #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) goto ours; #endif } } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { V_ipstat.ips_cantforward++; m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { V_ipstat.ips_cantforward++; m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; V_ipstat.ips_forward++; } /* * See if we belong to the destination multicast group on the * arrival interface. 
*/ IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); IN_MULTI_UNLOCK(); if (inm == NULL) { V_ipstat.ips_notmember++; m_freem(m); return; } goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * FAITH(Firewall Aided Internet Translator) */ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { if (V_ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { V_ipstat.ips_cantforward++; m_freem(m); } else { #ifdef IPSEC if (ip_ipsec_fwd(m)) goto bad; #endif /* IPSEC */ ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* Count the packet in the ip address stats */ if (ia != NULL) { ia->ia_ifa.if_ipackets++; ia->ia_ifa.if_ibytes += m->m_pkthdr.len; } /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & (IP_MF | IP_OFFMASK)) { m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } /* * Further protocols expect the packet length to be w/o the * IP header. */ ip->ip_len -= hlen; #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m)) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ V_ipstat.ips_delivered++; (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); return; bad: m_freem(m); } /* * After maxnipq has been updated, propagate the change to UMA. The UMA zone * max has slightly different semantics than the sysctl, for historical * reasons. */ static void maxnipq_update(void) { INIT_VNET_INET(curvnet); /* * -1 for unlimited allocation. */ if (V_maxnipq < 0) uma_zone_set_max(V_ipq_zone, 0); /* * Positive number for specific bound. */ if (V_maxnipq > 0) uma_zone_set_max(V_ipq_zone, V_maxnipq); /* * Zero specifies no further fragment queue allocation -- set the * bound very low, but rely on implementation elsewhere to actually * prevent allocation and reclaim current queues. */ if (V_maxnipq == 0) uma_zone_set_max(V_ipq_zone, 1); } static void ipq_zone_change(void *tag) { INIT_VNET_INET(curvnet); if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { V_maxnipq = nmbclusters / 32; maxnipq_update(); } } static int sysctl_maxnipq(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int error, i; i = V_maxnipq; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); /* * XXXRW: Might be a good idea to sanity check the argument and place * an extreme upper bound. */ if (i < -1) return (EINVAL); V_maxnipq = i; maxnipq_update(); return (0); } SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, sysctl_maxnipq, "I", "Maximum number of IPv4 fragment reassembly queue entries"); /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If the argument is the first fragment or one * in between the function will return NULL and store the mbuf * in the fragment chain. 
If the argument is the last fragment * the packet will be reassembled and the pointer to the new * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. */ struct mbuf * ip_reass(struct mbuf *m) { INIT_VNET_INET(curvnet); struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp = NULL; struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { V_ipstat.ips_fragments++; V_ipstat.ips_fragdropped++; m_freem(m); return (NULL); } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); head = &V_ipq[hash]; IPQ_LOCK(); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_ipq_match(m, fp) && #endif ip->ip_p == fp->ipq_p) goto found; fp = NULL; /* * Attempt to trim the number of allocated fragment queues if it * exceeds the administrative limit. */ if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { /* * drop something from the tail of the current queue * before proceeding further */ struct ipq *q = TAILQ_LAST(head, ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); if (r) { V_ipstat.ips_fragtimeout += r->ipq_nfrags; ip_freef(&V_ipq[i], r); break; } } } else { V_ipstat.ips_fragtimeout += q->ipq_nfrags; ip_freef(head, q); } } found: /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len -= hlen; if (ip->ip_off & IP_MF) { /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { V_ipstat.ips_toosmall++; /* XXX */ goto dropfrag; } m->m_flags |= M_FRAG; } else m->m_flags &= ~M_FRAG; ip->ip_off <<= 3; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ V_ipstat.ips_fragments++; m->m_pkthdr.header = ip; /* Previous ip_reass() started here. */ /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); V_nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; m->m_nextpkt = NULL; goto done; } else { fp->ipq_nfrags++; #ifdef MAC mac_ipq_update(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = ip->ip_tos & IPTOS_ECN_MASK; ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) goto dropfrag; if (ecn0 != IPTOS_ECN_CE) GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) goto dropfrag; /* * Find a segment which begins after this one does. 
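 *
 * Worked example: with queued fragments covering bytes [0,1480) and
 * [2960,4440) (after the ip_off <<= 3 conversion above), a new
 * fragment at offset 1480 stops the scan at the [2960,...) entry;
 * the code below then trims any byte overlap with the predecessor
 * and successors before linking it into the m_nextpkt chain.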
*/ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (GETIP(q)->ip_off > ip->ip_off) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the preceding * segment, then its checksum is invalidated. */ if (p) { i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; if (i > 0) { if (i >= ip->ip_len) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off += i; ip->ip_len -= i; } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; q = nq) { i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; if (i < GETIP(q)->ip_len) { GETIP(q)->ip_len -= i; GETIP(q)->ip_off += i; m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; V_ipstat.ips_fragdropped++; fp->ipq_nfrags--; m_freem(q); } /* * Check for complete reassembly and perform frag per packet * limiting. * * Frag limiting is performed here so that the nth frag has * a chance to complete the packet before we drop the packet. * As a result, n+1 frags are actually allowed per packet, but * only n will ever be stored. (n = maxfragsperpacket.) * */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) { V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; } next += GETIP(q)->ip_len; } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) { V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; } /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { V_ipstat.ips_toolong++; V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); goto done; } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = NULL; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_cat(m, q); } /* * In order to do checksumming faster we do 'end-around carry' here * (and not in for{} loop), though it implies we are not going to * reassemble more than 64k fragments. */ m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); #endif /* * Create header for new ip packet by modifying header of first * packet; dequeue and discard fragment reassembly header. * Make header visible.
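 *
 * (Aside: the 'end-around carry' step above can be sketched
 * stand-alone -- a hedged userland illustration, not kernel code:
 *
 *	#include <stdint.h>
 *
 *	static uint16_t
 *	fold_csum(uint32_t sum)
 *	{
 *		-- fold the accumulated carries back in exactly once
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return ((uint16_t)sum);
 *	}
 *
 * e.g. fold_csum(0x1fffe) == 0xffff; a single fold suffices only
 * because, as noted above, the accumulated carries stay small.)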
*/ ip->ip_len = (ip->ip_hl << 2) + next; ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); V_nipq--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); V_ipstat.ips_reassembled++; IPQ_UNLOCK(); return (m); dropfrag: V_ipstat.ips_fragdropped++; if (fp != NULL) fp->ipq_nfrags--; m_freem(m); done: IPQ_UNLOCK(); return (NULL); #undef GETIP } /* * Free a fragment reassembly header and all * associated datagrams. */ static void ip_freef(struct ipqhead *fhp, struct ipq *fp) { INIT_VNET_INET(curvnet); struct mbuf *q; IPQ_LOCK_ASSERT(); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); uma_zfree(V_ipq_zone, fp); V_nipq--; } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct ipq *fp; int i; IPQ_LOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { V_ipstat.ips_fragtimeout += fpp->ipq_nfrags; ip_freef(&V_ipq[i], fpp); } } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { for (i = 0; i < IPREASS_NHASH; i++) { while (V_nipq > V_maxnipq && !TAILQ_EMPTY(&V_ipq[i])) { V_ipstat.ips_fragdropped += TAILQ_FIRST(&V_ipq[i])->ipq_nfrags; ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IPQ_UNLOCK(); } /* * Drain off all datagram fragments. */ void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); int i; IPQ_LOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { while(!TAILQ_EMPTY(&V_ipq[i])) { V_ipstat.ips_fragdropped += TAILQ_FIRST(&V_ipq[i])->ipq_nfrags; ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IPQ_UNLOCK(); in_rtqdrain(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(u_char ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto == 0) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } else return (EINVAL); } } return (EPROTONOSUPPORT); } int ipproto_unregister(u_char ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto == 0) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. 
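 *
 * (Both functions maintain the ip_protox[] dispatch table consulted by
 * ip_input(); delivery ultimately reduces to
 *
 *	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
 *
 * so a slot that is reset here simply points at the IPPROTO_RAW
 * (raw-socket) entry again.)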
*/ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } /* * Given address of next destination (final or next hop), * return internet address info of interface to be used to get there. */ struct in_ifaddr * ip_rtaddr(struct in_addr dst, u_int fibnum) { struct route sro; struct sockaddr_in *sin; struct in_ifaddr *ifa; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; in_rtalloc_ign(&sro, RTF_CLONING, fibnum); if (sro.ro_rt == NULL) return (NULL); ifa = ifatoia(sro.ro_rt->rt_ifa); RTFREE(sro.ro_rt); return (ifa); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia = NULL; struct mbuf *mcopy; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { V_ipstat.ips_cantforward++; m_freem(m); return; } #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return; } /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ MGETHDR(mcopy, M_DONTWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags.
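 *
 * (The header-plus-8-bytes quotation prepared here follows RFC 792:
 * those first 8 payload bytes are what lets the original sender
 * demultiplex the ICMP error back to a transport endpoint -- for TCP
 * they hold the ports plus the sequence number, for UDP the ports,
 * length and checksum. Illustration of the quoted layout, as a
 * hypothetical struct:
 *
 *	struct icmp_quote {
 *		struct ip	iq_ip;		-- original header (+ options)
 *		uint8_t		iq_data[8];	-- first 8 payload bytes
 *	};
 * )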
*/ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { struct sockaddr_in *sin; struct rtentry *rt; bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; in_rtalloc_ign(&ro, RTF_CLONING, M_GETFIB(m)); rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } if (rt) RTFREE(rt); } /* * Try to cache the route MTU from ip_output so we can consider it for * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. */ bzero(&ro, sizeof(ro)); error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_rmx.rmx_mtu; if (ro.ro_rt) RTFREE(ro.ro_rt); if (error) V_ipstat.ips_cantforward++; else { V_ipstat.ips_forward++; if (type) V_ipstat.ips_redirectsent++; else { if (mcopy) m_freem(mcopy); return; } } if (mcopy == NULL) return; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(m, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ip->ip_len, 0); } V_ipstat.ips_cantfrag++; break; case ENOBUFS: /* * A router should not generate ICMP_SOURCEQUENCH as * required in RFC1812 Requirements for IP Version 4 Routers. * Source quench could be a big problem under DoS attacks, * or if the underlying interface is rate-limited. * Those who need source quench packets may re-enable them * via the net.inet.ip.sendsourcequench sysctl. 
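 *
 * i.e. the (discouraged) legacy behaviour can be re-enabled at
 * runtime with:
 *
 *	# sysctl net.inet.ip.sendsourcequench=1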
*/ if (V_ip_sendsourcequench == 0) { m_freem(mcopy); return; } else { type = ICMP_SOURCEQUENCH; code = 0; } break; case EACCES: /* ipfw denied packet */ m_freem(mcopy); return; } icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { INIT_VNET_NET(inp->inp_vnet); if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t) opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if (((ifp = m->m_pkthdr.rcvif)) && ( ifp->if_index && (ifp->if_index <= V_if_index))) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if ((sdp->sdl_family != AF_LINK) || (sdp->sdl_len > sizeof(sdlbuf))) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ -static int ip_rsvp_on; -struct socket *ip_rsvpd; int ip_rsvp_init(struct socket *so) { INIT_VNET_INET(so->so_vnet); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { INIT_VNET_INET(curvnet); V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. 
*/ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } void rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ { INIT_VNET_INET(curvnet); if (rsvp_input_p) { /* call the real one if loaded */ rsvp_input_p(m, off); return; } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return; } if (V_ip_rsvpd != NULL) { rip_input(m, off); return; } /* Drop the packet */ m_freem(m); } Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 185087) +++ head/sys/netinet/ip_output.c (revision 185088) @@ -1,1186 +1,1188 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); +#ifdef VIMAGE_GLOBALS u_short ip_id; +#endif #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). 
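 * A hedged sketch of what a hypothetical transport caller fills in
 * before handing the mbuf off (laddr, faddr and inp stand for state
 * the caller already holds):
 *
 *	ip = mtod(m, struct ip *);
 *	ip->ip_len = m->m_pkthdr.len;	-- host byte order at this layer
 *	ip->ip_off = 0;
 *	ip->ip_ttl = V_ip_defttl;
 *	ip->ip_p = IPPROTO_UDP;		-- example protocol
 *	ip->ip_tos = 0;
 *	ip->ip_src = laddr;		-- or INADDR_ANY to defer the choice
 *	ip->ip_dst = faddr;
 *	error = ip_output(m, NULL, NULL, 0, NULL, inp);
 *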
* The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int len, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct route iproute; struct in_addr odst; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; #endif M_ASSERTPKTHDR(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (inp != NULL) INP_LOCK_ASSERT(inp); if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); V_ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; again: /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } #ifdef IPFIREWALL_FORWARD if (ro->ro_rt == NULL && fwd_tag == NULL) { #else if (ro->ro_rt == NULL) { #endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. 
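 *
 * imo_multicast_ifp is normally a consequence of the sender's
 * IP_MULTICAST_IF socket option, e.g. (userland sketch, hypothetical
 * interface address):
 *
 *	struct in_addr ifaddr;
 *	ifaddr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr,
 *	    sizeof(ifaddr));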
*/ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (ro->ro_rt == NULL) #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); #else in_rtalloc_ign(ro, 0, inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); #endif if (ro->ro_rt == NULL) { V_ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) { /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu) ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; mtu = ro->ro_rt->rt_rmx.rmx_mtu; } else { mtu = ifp->if_mtu; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { IN_MULTI_UNLOCK(); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { IN_MULTI_UNLOCK(); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. 
*/ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ #ifdef ALTQ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= ifp->if_snd.ifq_maxlen)) #else if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= ifp->if_snd.ifq_maxlen) #endif /* ALTQ */ { error = ENOBUFS; V_ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip->ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } else goto again; /* Redo the routing table lookup. */ } #ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address?
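 *
 * The PACKET_TAG_IPFORWARD tag consumed below is attached by the
 * firewall's "fwd" action; a typical rule that exercises this path
 * (illustrative next-hop address) is:
 *
 *	# ipfw add 100 fwd 192.0.2.1 tcp from any to any 80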
*/ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag) { dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m_tag_delete(m, fwd_tag); goto again; } #endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { V_ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip->ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) ia->ia_ifa.if_opackets += m->m_pkthdr.len / m->m_pkthdr.tso_segsz; else ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; V_ipstat.ips_cantfrag++; goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) V_ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); } return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). 
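 *
 * Worked example: with mtu = 1500 and hlen = 20, each fragment can
 * carry len = (1500 - 20) & ~7 = 1480 payload bytes, so a 4000-byte
 * datagram (3980 bytes of payload) becomes three fragments carrying
 * 1480 + 1480 + 1020 bytes at offsets 0, 1480 and 2960.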
*/ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { INIT_VNET_INET(curvnet); int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ V_ipstat.ips_cantfrag++; return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back to the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; V_ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copy(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip->ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= ip->ip_len) { /* last fragment */ len = ip->ip_len - off; m->m_flags |= M_LASTFRAG; } else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ???
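 * ENOBUFS is a best-effort errno here: m_copy() failed for lack
 * of mbufs. Note also the offset encoding just above -- fragment
 * offsets travel in 8-byte units, so the second fragment of the
 * worked example before ip_fragment() (off = 1500, hlen = 20) gets
 * ip_off = (1500 - 20) >> 3 = 185 on the wire.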
*/ V_ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) mhip->ip_sum = in_cksum(m, mhlen); *mnext = m; mnext = &m->m_nextpkt; } V_ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip->ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, hlen); done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the appropriate next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; } break; #undef OPTSET /* * Multicast socket options are processed by the in_mcast * module. 
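 *
 * A hedged userland sketch of one option that lands in that module,
 * the protocol-independent join of RFC 3678 (hypothetical group
 * address):
 *
 *	struct group_req gr;
 *	struct sockaddr_in *gsin = (struct sockaddr_in *)&gr.gr_group;
 *	memset(&gr, 0, sizeof(gr));
 *	gr.gr_interface = 0;			-- any interface
 *	gsin->sin_family = AF_INET;
 *	gsin->sin_len = sizeof(*gsin);
 *	gsin->sin_addr.s_addr = inet_addr("239.1.2.3");
 *	setsockopt(s, IPPROTO_IP, MCAST_JOIN_GROUP, &gr, sizeof(gr));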
*/ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec4_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. 
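 * Whether the sending socket sees such a looped-back copy is governed
 * by its IP_MULTICAST_LOOP option (userland sketch):
 *
 *	u_char loop = 0;	-- suppress the local copy
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop));
 *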
Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen) { register struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the packet in order to generate checksums. */ copym = m_dup(m, M_DONTWAIT); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif if_simloop(ifp, copym, dst->sin_family, 0); } } Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 185087) +++ head/sys/netinet/raw_ip.c (revision 185088) @@ -1,1006 +1,1010 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include +#ifdef VIMAGE_GLOBALS struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; +#endif /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; ip_dn_ctl_t *ip_dn_ctl_ptr = NULL; /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ +#ifdef VIMAGE_GLOBALS struct socket *ip_mrouter; +#endif /* * The various mrouter and rsvp functions. */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(int, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); void (*rsvp_input_p)(struct mbuf *m, int off); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); LIST_REMOVE(inp, inp_hash); } /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ static void rip_zone_change(void *tag) { INIT_VNET_INET(curvnet); uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { INIT_VNET_INET(curvnet); INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); LIST_INIT(&V_ripcb); V_ripcbinfo.ipi_listhead = &V_ripcb; V_ripcbinfo.ipi_hashbase = hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask); V_ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_RLOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. 
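 * This implements the receive side of the IP_MINTTL socket option
 * (the GTSM idea of RFC 5082): a listener that only wants directly
 * connected peers asks for the maximum TTL, e.g.
 *
 *	int minttl = 255;
 *	setsockopt(s, IPPROTO_IP, IP_MINTTL, &minttl, sizeof(minttl));
 *
 * and anything that has crossed a router arrives with a smaller TTL
 * and is discarded below.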
*/ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ void rip_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed(inp->inp_cred) && (htonl(prison_getip(inp->inp_cred)) != ip->ip_dst.s_addr)) { continue; } if (last) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed(inp->inp_cred) && (htonl(prison_getip(inp->inp_cred)) != ip->ip_dst.s_addr)) { continue; } if (last) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) V_ipstat.ips_delivered--; INP_RUNLOCK(last); } else { m_freem(m); V_ipstat.ips_noproto++; V_ipstat.ips_delivered--; } } /* * Generate IP header and pass packet to ip_output. Tack on options user may * have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, u_long dst) { INIT_VNET_INET(so->so_vnet); struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. 
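 *
 * Userland counterpart (hedged sketch): a raw socket defaults to a
 * kernel-built header and opts into supplying its own via IP_HDRINCL:
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
 *	int on = 1;
 *	setsockopt(s, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on));
 *	-- sendto() payloads must now begin with a struct ip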
*/ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return (EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) return (ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = IP_DF; else ip->ip_off = 0; ip->ip_p = inp->inp_ip_p; ip->ip_len = m->m_pkthdr.len; if (jailed(inp->inp_cred)) ip->ip_src.s_addr = htonl(prison_getip(inp->inp_cred)); else ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return (EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); if (jailed(inp->inp_cred)) { if (ip->ip_src.s_addr != htonl(prison_getip(inp->inp_cred))) { INP_RUNLOCK(inp); m_freem(m); return (EPERM); } } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ip->ip_len > m->m_pkthdr.len) || (ip->ip_len < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } if (ip->ip_id == 0) ip->ip_id = ip_newid(); /* * XXX prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; V_ipstat.ips_rawout++; } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ?
ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* * in_ifscrub kills the interface route. */ in_ifscrub(ia->ia_ifp, ia); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back.
*/ in_ifadown(&ia->ia_ifa, 0); break; } } break; case PRC_IFUP: TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) return; flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; } } u_long rip_sendspace = 9216; u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); static int rip_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); } static void rip_abort(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_dodisconnect(so, inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_close(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_dodisconnect(so, inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static int rip_disconnect(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_dodisconnect(so, inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_NET(so->so_vnet); INIT_VNET_INET(so->so_vnet); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (jailed(td->td_ucred)) { if (addr->sin_addr.s_addr == INADDR_ANY) 
addr->sin_addr.s_addr = htonl(prison_getip(td->td_ucred)); if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr) return (EADDRNOTAVAIL); } if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_NET(so->so_vnet); INIT_VNET_INET(so->so_vnet); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } static int rip_pcblist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { /* XXX held references? 
*/ inp_list[i++] = inp; } INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_ripcbinfo); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; Index: head/sys/netinet/tcp_hostcache.c =================================================================== --- head/sys/netinet/tcp_hostcache.c (revision 185087) +++ head/sys/netinet/tcp_hostcache.c (revision 185088) @@ -1,704 +1,706 @@ /*- * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The tcp_hostcache moves the tcp-specific cached metrics from the routing * table to a dedicated structure indexed by the remote IP address. 
It keeps * information on the measured TCP parameters of past TCP sessions to allow * better initial start values to be used with later connections to/from the * same source. Depending on the network parameters (delay, bandwidth, max * MTU, congestion window) between local and remote sites, this can lead to * significant speed-ups for new TCP connections after the first one. * * Due to the tcp_hostcache, all TCP-specific metrics information in the * routing table has been removed. The inpcb no longer keeps a pointer to * the routing entry, and protocol-initiated route cloning has been removed * as well. With these changes, the routing table has gone back to being * more lightweight and only carries information related to packet forwarding. * * tcp_hostcache is designed for concurrent access in SMP * environments and under high contention. All bucket rows have their own lock and * thus multiple lookups and modifications can be done at the same time as long as * they are in different bucket rows. If a request for insertion of a new * record can't be satisfied, it simply returns an empty structure. Nobody * and nothing outside of tcp_hostcache.c will ever point directly to any * entry in the tcp_hostcache. All communication is done in an * object-oriented way and only functions of tcp_hostcache will manipulate * hostcache entries. Otherwise, we are unable to achieve good behaviour in * concurrent access situations. Since tcp_hostcache is only caching * information, there are no fatal consequences if we either can't satisfy * any particular request or have to drop/overwrite an existing entry because * of bucket-limit memory constraints. */ /* * Many thanks to jlemon for the basic structure of tcp_syncache which is being * followed here. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #ifdef INET6 #include #endif #include TAILQ_HEAD(hc_qhead, hc_metrics); struct hc_head { struct hc_qhead hch_bucket; u_int hch_length; struct mtx hch_mtx; }; struct hc_metrics { /* housekeeping */ TAILQ_ENTRY(hc_metrics) rmx_q; struct hc_head *rmx_head; /* head of bucket tail queue */ struct in_addr ip4; /* IP address */ struct in6_addr ip6; /* IP6 address */ /* endpoint specific values for TCP */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_bandwidth; /* estimated bandwidth */ u_long rmx_cwnd; /* congestion window */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ /* TCP hostcache internal data */ int rmx_expire; /* lifetime for object */ u_long rmx_hits; /* number of hits */ u_long rmx_updates; /* number of updates */ }; /* Arbitrary values */ #define TCP_HOSTCACHE_HASHSIZE 512 #define TCP_HOSTCACHE_BUCKETLIMIT 30 #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ struct tcp_hostcache { struct hc_head *hashbase; uma_zone_t zone; u_int hashsize; u_int hashmask; u_int bucket_limit; u_int cache_count; u_int cache_limit; int expire; int prune; int purgeall; }; -static struct tcp_hostcache tcp_hostcache; +#ifdef VIMAGE_GLOBALS +static struct tcp_hostcache tcp_hostcache; static struct callout tcp_hc_callout; +#endif static struct
hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge(void *); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, tcp_hostcache.cache_count, 0, "Current number of entries in hostcache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW, tcp_hostcache.prune, 0, "Time between purge runs"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW, tcp_hostcache.purgeall, 0, "Expire all entries on next purge run"); SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, sysctl_tcp_hc_list, "A", "List of all hostcache entries"); static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache"); #define HOSTCACHE_HASH(ip) \ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ V_tcp_hostcache.hashmask) /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ #define HOSTCACHE_HASH6(ip6) \ (((ip6)->s6_addr32[0] ^ \ (ip6)->s6_addr32[1] ^ \ (ip6)->s6_addr32[2] ^ \ (ip6)->s6_addr32[3]) & \ V_tcp_hostcache.hashmask) #define THC_LOCK(lp) mtx_lock(lp) #define THC_UNLOCK(lp) mtx_unlock(lp) void tcp_hc_init(void) { INIT_VNET_INET(curvnet); int i; /* * Initialize hostcache structures. */ V_tcp_hostcache.cache_count = 0; V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; V_tcp_hostcache.cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", &V_tcp_hostcache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", &V_tcp_hostcache.cache_limit); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", &V_tcp_hostcache.bucket_limit); if (!powerof2(V_tcp_hostcache.hashsize)) { printf("WARNING: hostcache hash size is not a power of 2.\n"); V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ } V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1; /* * Allocate the hash table. */ V_tcp_hostcache.hashbase = (struct hc_head *) malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head), M_HOSTCACHE, M_WAITOK | M_ZERO); /* * Initialize the hash buckets. */ for (i = 0; i < V_tcp_hostcache.hashsize; i++) { TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket); V_tcp_hostcache.hashbase[i].hch_length = 0; mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", NULL, MTX_DEF); } /* * Allocate the hostcache entries.
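 *
 * (Illustrative arithmetic, not from the original sources: with the
 * compile-time defaults above, uma_zone_set_max() below caps the zone
 * at hashsize * bucket_limit = 512 * 30 = 15360 entries; the loader
 * tunables fetched in tcp_hc_init() may override each factor.)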
*/ V_tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit); /* * Set up periodic cache cleanup. */ callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, 0); } /* * Internal function: look up an entry in the hostcache or return NULL. * * If an entry has been returned, the caller becomes responsible for * unlocking the bucket row after he is done reading/modifying the entry. */ static struct hc_metrics * tcp_hc_lookup(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer")); /* * Hash the foreign ip address. */ if (inc->inc_isipv6) hash = HOSTCACHE_HASH6(&inc->inc6_faddr); else hash = HOSTCACHE_HASH(&inc->inc_faddr); hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't * find an entry, otherwise the caller has to unlock after he is * done. */ THC_LOCK(&hc_head->hch_mtx); /* * Iterate through entries in bucket row looking for a match. */ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { if (inc->inc_isipv6) { if (memcmp(&inc->inc6_faddr, &hc_entry->ip6, sizeof(inc->inc6_faddr)) == 0) return hc_entry; } else { if (memcmp(&inc->inc_faddr, &hc_entry->ip4, sizeof(inc->inc_faddr)) == 0) return hc_entry; } } /* * We were unsuccessful and didn't find anything. */ THC_UNLOCK(&hc_head->hch_mtx); return NULL; } /* * Internal function: insert an entry into the hostcache or return NULL if * unable to allocate a new one. * * If an entry has been returned, the caller becomes responsible for * unlocking the bucket row after he is done reading/modifying the entry. */ static struct hc_metrics * tcp_hc_insert(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer")); /* * Hash the foreign ip address. */ if (inc->inc_isipv6) hash = HOSTCACHE_HASH6(&inc->inc6_faddr); else hash = HOSTCACHE_HASH(&inc->inc_faddr); hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't * find an entry, otherwise the caller has to unlock after he is * done. */ THC_LOCK(&hc_head->hch_mtx); /* * If the bucket limit is reached, reuse the least-used element. */ if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit || V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) { hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); /* * At first we were dropping the last element, just to * reacquire it in the next two lines again, which isn't very * efficient. Instead just reuse the least used element. * We may drop something that is still "in-use" but we can be * "lossy". * Just give up if this bucket row is empty and we don't have * anything to replace. */ if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; } TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); V_tcp_hostcache.hashbase[hash].hch_length--; V_tcp_hostcache.cache_count--; V_tcpstat.tcps_hc_bucketoverflow++; #if 0 uma_zfree(V_tcp_hostcache.zone, hc_entry); #endif } else { /* * Allocate a new entry, or balk if not possible. 
*/ hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT); if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; } } /* * Initialize basic information of hostcache entry. */ bzero(hc_entry, sizeof(*hc_entry)); if (inc->inc_isipv6) bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6)); else hc_entry->ip4 = inc->inc_faddr; hc_entry->rmx_head = hc_head; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* * Put it upfront. */ TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); V_tcp_hostcache.hashbase[hash].hch_length++; V_tcp_hostcache.cache_count++; V_tcpstat.tcps_hc_added++; return hc_entry; } /* * External function: look up an entry in the hostcache and fill out the * supplied TCP metrics structure. Fills the structure with zeroes when no * entry was found or a value is not set. */ void tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) { INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; /* * Find the right bucket. */ hc_entry = tcp_hc_lookup(inc); /* * If we don't have an existing object, return zeroed metrics. */ if (hc_entry == NULL) { bzero(hc_metrics_lite, sizeof(*hc_metrics_lite)); return; } hc_entry->rmx_hits++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt; hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar; hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth; hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd; hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe; hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe; /* * Unlock bucket row. */ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * External function: look up an entry in the hostcache and return the * discovered path MTU. Returns 0 if no entry is found or the value is not * set. */ u_long tcp_hc_getmtu(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; u_long mtu; hc_entry = tcp_hc_lookup(inc); if (hc_entry == NULL) { return 0; } hc_entry->rmx_hits++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ mtu = hc_entry->rmx_mtu; THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); return mtu; } /* * External function: update the MTU value of an entry in the hostcache. * Creates a new entry if none was found. */ void tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu) { INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; /* * Find the right bucket. */ hc_entry = tcp_hc_lookup(inc); /* * If we don't have an existing object, try to insert a new one. */ if (hc_entry == NULL) { hc_entry = tcp_hc_insert(inc); if (hc_entry == NULL) return; } hc_entry->rmx_updates++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_entry->rmx_mtu = mtu; /* * Put it upfront so we find it faster next time. */ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); /* * Unlock bucket row. */ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * External function: update the TCP metrics of an entry in the hostcache. * Creates a new entry if none was found.
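 *
 * Each nonzero sample below is folded into the cached value with a
 * simple running average, new = (cached + sample) / 2, i.e. an
 * exponentially weighted moving average with gain 1/2. Worked
 * example (illustrative numbers only): a cached rmx_rtt of 100
 * updated with three samples of 200 moves through 150, 175 and 187
 * (integer division), tracking recent samples while damping outliers.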
*/ void tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) { INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; hc_entry = tcp_hc_lookup(inc); if (hc_entry == NULL) { hc_entry = tcp_hc_insert(inc); if (hc_entry == NULL) return; } hc_entry->rmx_updates++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ if (hcml->rmx_rtt != 0) { if (hc_entry->rmx_rtt == 0) hc_entry->rmx_rtt = hcml->rmx_rtt; else hc_entry->rmx_rtt = (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2; V_tcpstat.tcps_cachedrtt++; } if (hcml->rmx_rttvar != 0) { if (hc_entry->rmx_rttvar == 0) hc_entry->rmx_rttvar = hcml->rmx_rttvar; else hc_entry->rmx_rttvar = (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2; V_tcpstat.tcps_cachedrttvar++; } if (hcml->rmx_ssthresh != 0) { if (hc_entry->rmx_ssthresh == 0) hc_entry->rmx_ssthresh = hcml->rmx_ssthresh; else hc_entry->rmx_ssthresh = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; V_tcpstat.tcps_cachedssthresh++; } if (hcml->rmx_bandwidth != 0) { if (hc_entry->rmx_bandwidth == 0) hc_entry->rmx_bandwidth = hcml->rmx_bandwidth; else hc_entry->rmx_bandwidth = (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; /* V_tcpstat.tcps_cachedbandwidth++; */ } if (hcml->rmx_cwnd != 0) { if (hc_entry->rmx_cwnd == 0) hc_entry->rmx_cwnd = hcml->rmx_cwnd; else hc_entry->rmx_cwnd = (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2; /* V_tcpstat.tcps_cachedcwnd++; */ } if (hcml->rmx_sendpipe != 0) { if (hc_entry->rmx_sendpipe == 0) hc_entry->rmx_sendpipe = hcml->rmx_sendpipe; else hc_entry->rmx_sendpipe = (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) / 2; /* V_tcpstat.tcps_cachedsendpipe++; */ } if (hcml->rmx_recvpipe != 0) { if (hc_entry->rmx_recvpipe == 0) hc_entry->rmx_recvpipe = hcml->rmx_recvpipe; else hc_entry->rmx_recvpipe = (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) / 2; /* V_tcpstat.tcps_cachedrecvpipe++; */ } TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * Sysctl function: prints the list and values of all hostcache entries in * unsorted order. */ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int bufsize; int linesize = 128; char *p, *buf; int len, i, error; struct hc_metrics *hc_entry; #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif bufsize = linesize * (V_tcp_hostcache.cache_count + 1); p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); len = snprintf(p, linesize, "\nIP address MTU SSTHRESH RTT RTTVAR BANDWIDTH " " CWND SENDPIPE RECVPIPE HITS UPD EXP\n"); p += len; #define msec(u) (((u) + 500) / 1000) for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { len = snprintf(p, linesize, "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " "%4lu %4lu %4i\n", hc_entry->ip4.s_addr ?
inet_ntoa(hc_entry->ip4) : #ifdef INET6 ip6_sprintf(ip6buf, &hc_entry->ip6), #else "IPv6?", #endif hc_entry->rmx_mtu, hc_entry->rmx_ssthresh, msec(hc_entry->rmx_rtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), msec(hc_entry->rmx_rttvar * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), hc_entry->rmx_bandwidth * 8, hc_entry->rmx_cwnd, hc_entry->rmx_sendpipe, hc_entry->rmx_recvpipe, hc_entry->rmx_hits, hc_entry->rmx_updates, hc_entry->rmx_expire); p += len; } THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } #undef msec error = SYSCTL_OUT(req, buf, p - buf); free(buf, M_TEMP); return(error); } /* * Expire and purge (old|all) entries in the tcp_hostcache. Runs * periodically from the callout. */ static void tcp_hc_purge(void *arg) { INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry, *hc_next; int all = (intptr_t)arg; int i; if (V_tcp_hostcache.purgeall) { all = 1; V_tcp_hostcache.purgeall = 0; } for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); TAILQ_FOREACH_SAFE(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) { if (all || hc_entry->rmx_expire <= 0) { TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket, hc_entry, rmx_q); uma_zfree(V_tcp_hostcache.zone, hc_entry); V_tcp_hostcache.hashbase[i].hch_length--; V_tcp_hostcache.cache_count--; } else hc_entry->rmx_expire -= V_tcp_hostcache.prune; } THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, arg); } Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 185087) +++ head/sys/netinet/tcp_input.c (revision 185088) @@ -1,3180 +1,3185 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include static const int tcprexmtthresh = 3; +#ifdef VIMAGE_GLOBALS struct tcpstat tcpstat; +int blackhole; +int tcp_delack_enabled; +int drop_synfin; +int tcp_do_rfc3042; +int tcp_do_rfc3390; +int tcp_do_ecn; +int tcp_ecn_maxretries; +int tcp_insecure_rst; +int tcp_do_autorcvbuf; +int tcp_autorcvbuf_inc; +int tcp_autorcvbuf_max; +#endif + SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); -static int blackhole = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, blackhole, 0, "Do not send RST on segments to closed ports"); -int tcp_delack_enabled = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); -static int drop_synfin = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); -static int tcp_do_rfc3042 = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); -static int tcp_do_rfc3390 = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); -int tcp_do_ecn = 0; -int tcp_ecn_maxretries = 1; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN"); -static int tcp_insecure_rst = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets"); -int tcp_do_autorcvbuf = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); -int tcp_autorcvbuf_inc = 16*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, tcp_autorcvbuf_inc, 0, "Increment step size of automatic receive buffer"); -int tcp_autorcvbuf_max = 256*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); +#ifdef VIMAGE_GLOBALS struct
inpcbhead tcb; -#define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; +#endif +#define tcb6 tcb /* for KAME src sync over BSD*'s */ static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static void inline tcp_congestion_exp(struct tcpcb *); static void inline tcp_congestion_exp(struct tcpcb *tp) { u_int win; win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(struct mbuf *m, int off0) { INIT_VNET_INET(curvnet); #ifdef INET6 INIT_VNET_INET6(curvnet); #endif #ifdef IPSEC INIT_VNET_IPSEC(curvnet); #endif struct tcphdr *th; struct ip *ip = NULL; struct ipovly *ipov; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; const int isipv6 = 0; #endif struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. 
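 *
 * (Illustrative note: the debug copies below store sizeof(struct ip),
 * 20 bytes, for IPv4 and sizeof(struct ip6_hdr), 40 bytes, for IPv6,
 * so a single IP6_HDR_LEN-sized buffer covers both cases.)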
*/ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif to.to_flags = 0; V_tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { V_tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about an unspecified IPv6 source address. * As we use the all-zero address to indicate an unbound/unconnected * pcb, an unspecified IPv6 address can be used to confuse us. * * Note that packets with an unspecified IPv6 destination are * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: Avoid compiler warning. */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { V_tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { V_tcpstat.tcps_rcvbadsum++; goto drop; } /* Re-initialization for later version check */ ip->ip_v = IPVERSION; } #ifdef INET6 if (isipv6) iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; else #endif iptos = ip->ip_tos; /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { V_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { V_tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&V_tcbinfo); findpcb: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
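 *
 * Such a tag is attached upstream by an ipfw forward rule; as an
 * illustrative sketch (rule number and ports are made up):
 *
 *	ipfw add 100 fwd 127.0.0.1,8021 tcp from any to any 21 in
 *
 * redirects inbound FTP control traffic to a local proxy, and the
 * lookup below then pretends the proxy address was the destination.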
*/ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } else #endif /* IPFIREWALL_FORWARD */ { if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || tcp_log_in_vain == 2) { if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK(inp); #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { V_ipsec6stat.in_polvio++; goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { V_ipsec4stat.in_polvio++; goto dropunlock; } #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch * stray or duplicate segments arriving late. If this segment * was a legitimate new connection attempt the old INPCB gets * removed and we can try again to find a listening socket. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; INP_INFO_WUNLOCK(&V_tcbinfo); return; } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. 
*/ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) { #ifdef INET6 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); #endif } else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); bzero(&inc, sizeof(inc)); inc.inc_isipv6 = isipv6; #ifdef INET6 if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ if (!syncache_expand(&inc, &to, th, &so, m)) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); INP_WLOCK(inp); /* new connection */ tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto dropunlock; } /* * We can't do anything without SYN. 
*/ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); V_tcpstat.tcps_badsyn++; goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ V_tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); V_tcpstat.tcps_badsyn++; goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated addresses are forbidden, * we do not accept SYNs to a deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept a SYN, we send a TCP RST * with the deprecated source address (instead of dropping * it). We compromise because it is much better for the peer * to send a RST, and the RST will be the final packet * of the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says the following: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into the (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description texts for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them.
*/ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } if (isipv6) { #ifdef INET6 if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } #endif } else { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); syncache_add(&inc, &to, th, inp, &so, m); /* * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). */ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; dropwithreset: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_INFO_WUNLOCK(&V_tcbinfo); if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if (inp != NULL) INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); drop: INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); } static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { INIT_VNET_INET(tp->t_vnet); int thflags, acked, ourfinisacked, needoutput = 0; int headlocked = 1; int rstreason, todrop, win; u_long tiwin; struct tcpopt to; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. 
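 *
 * (Illustrative note: re-arming TT_KEEP with tcp_keepidle restarts
 * the keep-alive countdown, so with the stock two-hour default every
 * accepted segment pushes the next keep-alive probe two hours out.)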
*/ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; V_tcpstat.tcps_ecn_ce++; break; case IPTOS_ECN_ECT0: V_tcpstat.tcps_ecn_ect0++; break; case IPTOS_ECN_ECT1: V_tcpstat.tcps_ecn_ect1++; break; } if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; /* * Congestion experienced. * Ignore if we are already trying to recover. */ if ((thflags & TH_ECE) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { V_tcpstat.tcps_ecn_rcwnd++; tcp_congestion_exp(tp); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, ticks)) to.to_tsecr = 0; } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26).
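 *
 * Worked example (illustrative values): with last_ack_sent = 1000,
 * an in-window segment carrying th_seq = 995 and TSval = 12345
 * passes the SEQ_LEQ() test below, so ts_recent becomes 12345 and
 * will be echoed as TSecr in our next segment.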
*/ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT) && tp->t_dupacks < tcprexmtthresh) || ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure ack for outstanding data. */ ++V_tcpstat.tcps_predack; /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; V_tcpstat.tcps_rcvackpack++; V_tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* Some progress has been made. */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ KASSERT(headlocked, ("%s: headlocked", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++V_tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. 
*/ tp->rcv_up = tp->rcv_nxt; V_tcpstat.tcps_rcvpack++; V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (e.g. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit them * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 2. received bytes per RTT is within seven eighths of the * current socket buffer size; * 3. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (to.to_tsecr > tp->rfbuf_ts && to.to_tsecr - tp->rfbuf_ts < hz) { if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); } /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else tp->rfbuf_cnt += tlen; /* add up */ } /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /* Reset receive buffer auto scaling when not in bulk receive mode. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST.
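* Acceptable here means tp->snd_una < th->th_ack <= tp->snd_max; anything else is answered with a RST, rate-limited as BANDLIM_RST_OPENPORT.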
*/ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) tp = tcp_drop(tp, ECONNREFUSED); if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { V_tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; V_tcpstat.tcps_ecn_shs++; } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_state = TCPS_SYN_RECEIVED; } KASSERT(headlocked, ("%s: trimthenstep6: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; V_tcpstat.tcps_rcvpackafterwin++; V_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. 
Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall through and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the initial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test, examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337.
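* Concretely, the code below accepts a RST only when last_ack_sent - 1 <= SEG.SEQ <= last_ack_sent + rcv_wnd, and in ESTABLISHED (unless net.inet.tcp.insecure_rst is enabled) further requires SEG.SEQ to lie within one sequence number of rcv_nxt or last_ack_sent.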
*/ if (thflags & TH_RST) { if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (V_tcp_insecure_rst == 0 && !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { V_tcpstat.tcps_badrst++; goto drop; } /* FALLTHROUGH */ case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; V_tcpstat.tcps_drops++; KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close.2: head not locked", __func__)); tp = tcp_close(tp); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { V_tcpstat.tcps_rcvduppack++; V_tcpstat.tcps_rcvdupbyte += tlen; V_tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; V_tcpstat.tcps_rcvduppack++; V_tcpstat.tcps_rcvdupbyte += todrop; } else { V_tcpstat.tcps_rcvpartduppack++; V_tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. 
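* SS_NOFDREF means the last file descriptor reference is gone, and any state beyond CLOSE_WAIT means our FIN is already out, so newly arriving data can never be delivered to anyone.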
*/ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { char *s; KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " "not locked", __func__)); if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " "was closed, sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); V_tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { V_tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { V_tcpstat.tcps_rcvbyteafterwin += tlen; /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; V_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else V_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " "head not locked", __func__)); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: V_tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? 
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information, we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { V_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { V_tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (i.e., window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp))) { if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery.
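* tcprexmtthresh defaults to 3, so it is the third consecutive pure duplicate ACK that triggers the fast retransmit below.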
*/ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp)) { tp->t_dupacks = 0; break; } } else if (V_tcp_do_newreno || V_tcp_do_ecn) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } tcp_congestion_exp(tp); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { V_tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: KASSERT(headlocked, ("%s: process_ACK: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; V_tcpstat.tcps_rcvackpack++; V_tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. 
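* t_badrxtwin is armed when the first timeout retransmission goes out; an ACK arriving inside that window is assumed to belong to the original transmission, so the timeout is judged spurious and the pre-timeout cwnd/ssthresh are restored here.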
*/ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). * If cwnd > maxseg^2, fix the cwnd increment at 1 byte * to avoid capping cwnd (as suggested in RFC2581). */ if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || !IN_FASTRECOVERY(tp)) { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = max((incr * incr / cw), 1); tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. */ if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2.
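* If the receive side is already shut down, the FIN_WAIT_2 code below also starts a reclaim timer: with net.inet.tcp.fast_finwait2_recycle set it uses the short tcp_finwait2_timeout instead of tcp_maxidle.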
*/ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { int timeout; soisdisconnected(so); timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : tcp_maxidle; tcp_timer_activate(tp, TT_2MSL, timeout); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("%s: process_ACK: " "head not locked", __func__)); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { KASSERT(headlocked, ("%s: process_ACK: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); goto drop; } break; } } step6: KASSERT(headlocked, ("%s: step6: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TACs send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) V_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so it doesn't get presented to the user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick.
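* tcp_pulloutofband() below stashes the single urgent byte in tp->t_iobc, sets TCPOOB_HAVEDATA, and shrinks the mbuf chain by that one byte.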
*/ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; V_tcpstat.tcps_rcvpack++; V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually free any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received, ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (i.e., NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE, FIN has not been acked, so * enter the CLOSING state.
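* This is the simultaneous-close case: both ends sent a FIN before seeing the other side's ACK.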
*/ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("%s: dodata: " "TCP_FIN_WAIT_2: head not locked", __func__)); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); return; } } INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(headlocked == 0, ("%s: check_delack: head locked", __func__)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (headlocked) INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. 
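* Never answer a RST with a RST either, or two stacks could end up volleying resets at each other indefinitely.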
*/ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } else #endif { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { INIT_VNET_INET(curvnet); int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; V_tcpstat.tcps_sack_rcv_blocks++; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. 
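* For example, with off indexing just past the TCP headers and th_urp = 3, the byte at chain offset off + 2 is the one copied out and removed below.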
*/ static void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); int delta; INP_WLOCK_ASSERT(tp->t_inpcb); V_tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 32). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * The retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs.
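* (For example, a 9000-byte jumbo MTU minus the 40-byte IPv4+TCP headers leaves 8960 bytes, which the MCLBYTES rounding turns into 8192 with the usual 2048-byte clusters.)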
If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, struct hc_metrics_lite *metricptr, int *mtuflags) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); int mss; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags); tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags); tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxopd above. */ offer = tp->t_maxopd; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in the tcp hostcache, use it; * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default.
But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if all * the option space is used (40 bytes). Otherwise * funny things may happen in tcp_output. */ mss = max(mss, 64); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates that no segments were received yet. * In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int rtt, mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; int mtuflags = 0; #ifdef INET6 int isipv6; #endif KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); tcp_mss_update(tp, offer, &metrics, &mtuflags); mss = tp->t_maxseg; inp = tp->t_inpcb; #ifdef INET6 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; #endif /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different from the default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too.
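* Namely, seed srtt/rttvar, ssthresh, and bandwidth (and possibly cwnd) from hostcache metrics left behind by earlier connections to the same peer.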
*/ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; V_tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; V_tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); V_tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not to make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overload the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't get lost. * We currently check only in syncache_socket for that. */ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * V_ss_fltsz_local; else tp->snd_cwnd = mss * V_ss_fltsz; /* Check the interface for TSO capabilities. */ if (mtuflags & CSUM_TSO) tp->t_flags |= TF_TSO; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * When a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to th_ack, this forces the retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.)
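* Example: if the partial ACK covers two segments, cwnd is temporarily acked plus one t_maxseg, which leaves room for exactly one forced retransmission before cwnd is restored and then deflated below.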
*/ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > th->th_ack - tp->snd_una) tp->snd_cwnd -= th->th_ack - tp->snd_una; else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 185087) +++ head/sys/netinet/tcp_output.c (revision 185088) @@ -1,1462 +1,1465 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #include #ifdef notyet extern struct mbuf *m_copypack(); #endif -int path_mtu_discovery = 1; +#ifdef VIMAGE_GLOBALS +int path_mtu_discovery; +int ss_fltsz; +int ss_fltsz_local; +int tcp_do_newreno; +int tcp_do_tso; +int tcp_do_autosndbuf; +int tcp_autosndbuf_inc; +int tcp_autosndbuf_max; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, path_mtu_discovery, 1, "Enable Path MTU Discovery"); -int ss_fltsz = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, ss_fltsz, 1, "Slow start flight size"); -int ss_fltsz_local = 4; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_newreno = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, tcp_do_newreno, 0, "Enable NewReno Algorithms"); -int tcp_do_tso = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, tcp_do_tso, 0, "Enable TCP Segmentation Offload"); -int tcp_do_autosndbuf = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); -int tcp_autosndbuf_inc = 8*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); -int tcp_autosndbuf_max = 256*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error; struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso = 0; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(tp->t_inpcb); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. * * Set the slow-start flight size depending on whether * this is a local network or not. 
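*
* Illustrative sketch (not part of this revision): the restart window
* chosen below, as a self-contained model. The constants mirror the
* slowstart_flightsize sysctl defaults declared above; restart_cwnd()
* and is_local are hypothetical names.
*/

#include <stdio.h>

#define SS_FLTSZ	1	/* net.inet.tcp.slowstart_flightsize */
#define SS_FLTSZ_LOCAL	4	/* net.inet.tcp.local_slowstart_flightsize */

/*
 * Restart window used when a connection has been idle for at least
 * one retransmit timeout: a few segments, more on a local network.
 */
static unsigned long
restart_cwnd(unsigned int maxseg, int is_local)
{
	return ((unsigned long)maxseg *
	    (is_local ? SS_FLTSZ_LOCAL : SS_FLTSZ));
}

int
main(void)
{
	printf("remote: %lu bytes, local: %lu bytes\n",
	    restart_cwnd(1448, 0), restart_cwnd(1448, 1));
	return (0);
}

/*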
*/ int ss = V_ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) ss = V_ss_fltsz_local; } else #endif /* INET6 */ if (in_localaddr(tp->t_inpcb->inp_faddr)) ss = V_ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery; reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; V_tcpstat.tcps_sack_rexmits++; V_tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself.
*/ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); /* * Don't remove this (len > 0) check! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (e.g., trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4.
our send window (slow start and congestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However, testing has shown this not * to be much of a problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < V_tcp_autosndbuf_max && sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } /* * Truncate to the maximum segment length or enable TCP Segmentation * Offloading (if supported by hardware) and ensure that FIN is removed * if the length no longer contains the last data byte. * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertisements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. * * The length of TSO bursts is limited to TCP_MAXWIN. That limit and * removal of FIN (if not already caught here) are handled later after * the exact length of the TCP options is known. */ #ifdef IPSEC /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if (len > tp->t_maxseg) { if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL #ifdef IPSEC && ipsec_optlen == 0 #endif ) { tso = 1; } else { len = tp->t_maxseg; sendalot = 1; tso = 0; } } if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more than 1/2 the maximum send window's worth of * data (receiver may be limited by the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary.
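*
* Illustrative sketch (not part of this revision): the silly window
* avoidance conditions listed above, condensed into one predicate.
* A userland model; should_transmit() and its flattened arguments are
* hypothetical stand-ins for the tcpcb and sockbuf state, and the
* NOPUSH/MORETOCOME refinements of the real code are omitted.
*/

#include <stdio.h>

/*
 * Transmit when len is non-zero if: a full segment is ready, this is
 * the tail of a write() and we are idle or running NODELAY, the
 * persist timer forced data out, at least half of the largest window
 * the peer ever offered is ready, or this is a retransmission.
 */
static int
should_transmit(long len, unsigned maxseg, int idle, int nodelay,
    int forcedata, long off, long sb_cc, unsigned long max_sndwnd,
    int rexmit, int sack_rxmit)
{
	if (len >= (long)maxseg)
		return (1);
	if ((idle || nodelay) && len + off >= sb_cc)
		return (1);
	if (forcedata)
		return (1);
	if (max_sndwnd > 0 && (unsigned long)len >= max_sndwnd / 2)
		return (1);
	if (rexmit || sack_rxmit)
		return (1);
	return (0);
}

int
main(void)
{
	/* A lone 100-byte tail of a write() on an idle connection. */
	printf("send? %d\n",
	    should_transmit(100, 1448, 1, 0, 0, 0, 100, 65535, 0, 0));
	return (0);
}

/*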
*/ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then we want to send a window update to peer. * Skip this if the connection is in T/TCP half-open state. * Don't send pure window updates when the peer has closed * the connection and won't ever send more data. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); if (adv >= (long) (2 * tp->t_maxseg)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to ensure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e.
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ if ((tp->t_flags & TF_NOOPT) == 0) { to.to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = ticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = ticks; } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. * Clear the FIN bit because we cut off the tail of * the segment. * * When doing TSO limit a burst to TCP_MAXWIN minus the * IP, TCP and Options length to keep ip->ip_len from * overflowing. Prevent the last segment from being * fractional thus making them all equal sized and set * the flag to continue sending. TSO is disabled when * IP options or IPSEC are present. */ if (len + optlen + ipoptlen > tp->t_maxopd) { flags &= ~TH_FIN; if (tso) { if (len > TCP_MAXWIN - hdrlen - optlen) { len = TCP_MAXWIN - hdrlen - optlen; len = len - (len % (tp->t_maxopd - optlen)); sendalot = 1; } else if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } } /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. 
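*
* Illustrative sketch (not part of this revision): the TSO burst
* clamp applied above when len plus options exceed t_maxopd, as a
* standalone model. tso_clamp() is a hypothetical name; the
* arithmetic follows the kernel statements above.
*/

#include <stdio.h>

#define TCP_MAXWIN	65535	/* largest unscaled window */

/*
 * Keep a TSO burst below TCP_MAXWIN including headers and options,
 * then round down to a whole number of per-segment payloads so no
 * fractional segment is generated (t_maxopd is payload + options).
 */
static long
tso_clamp(long len, unsigned hdrlen, unsigned optlen, unsigned t_maxopd)
{
	long seg_payload = (long)t_maxopd - (long)optlen;

	if (len > TCP_MAXWIN - (long)hdrlen - (long)optlen) {
		len = TCP_MAXWIN - (long)hdrlen - (long)optlen;
		len -= len % seg_payload;
	}
	return (len);
}

int
main(void)
{
	/* 100 kB queued, 52 bytes of headers, 12 bytes of options. */
	printf("burst: %ld bytes\n", tso_clamp(100 * 1024, 52, 12, 1460));
	return (0);
}

/*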
*/ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) V_tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { V_tcpstat.tcps_sndrexmitpack++; V_tcpstat.tcps_sndrexmitbyte += len; } else { V_tcpstat.tcps_sndpack++; V_tcpstat.tcps_sndbyte += len; } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, (int)len, max_linkhdr + hdrlen)) == 0) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; goto out; } /* * m_copypack left space for our hdr; use it. */ m->m_len += hdrlen; m->m_data -= hdrlen; #else MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; goto out; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } } #endif m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(mb, moff, (int)len); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; goto out; } } #endif /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) V_tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) V_tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) V_tcpstat.tcps_sndurg++; else V_tcpstat.tcps_sndwinup++; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. 
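*
* Illustrative sketch (not part of this revision): the test below
* that decides which packets get the ECT(0) codepoint, as a compact
* predicate. ecn_mark_ect() is a hypothetical name; SEQ_GEQ is the
* usual wrap-safe sequence comparison.
*/

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

/*
 * Mark only new data packets: some payload, not a retransmission
 * (snd_nxt has not fallen behind snd_max), and not a one-byte
 * window probe forced out by the persist timer.
 */
static int
ecn_mark_ect(long len, tcp_seq snd_nxt, tcp_seq snd_max, int forcedata)
{
	return (len > 0 && SEQ_GEQ(snd_nxt, snd_max) &&
	    !(forcedata && len == 1));
}

int
main(void)
{
	printf("new data: %d, window probe: %d\n",
	    ecn_mark_ect(1448, 5000, 5000, 0),
	    ecn_mark_ect(1, 5000, 5000, 1));
	return (0);
}

/*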
*/ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; V_tcpstat.tcps_ecn_ect0++; } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in th_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (recwin == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); } #endif /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ #ifdef INET6 if (isipv6) /* * ip6_plen need not be filled now, and will be filled * in ip6_output.
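*
* Illustrative sketch (not part of this revision): the receive window
* calculation above, extracted into a pure function. calc_recwin() is
* a hypothetical name; the three clamps are the SWS avoidance,
* no-shrink, and scaled-maximum rules from the kernel code.
*/

#include <stdio.h>

#define TCP_MAXWIN	65535

typedef unsigned int tcp_seq;

static long
calc_recwin(long recwin, long sb_hiwat, long maxseg, tcp_seq rcv_adv,
    tcp_seq rcv_nxt, int rcv_scale)
{
	/* Suppress silly small windows. */
	if (recwin < sb_hiwat / 4 && recwin < maxseg)
		recwin = 0;
	/* Never take back window already advertised. */
	if (recwin < (long)(rcv_adv - rcv_nxt))
		recwin = (long)(rcv_adv - rcv_nxt);
	/* Cap at the largest window the scale factor can carry. */
	if (recwin > (long)TCP_MAXWIN << rcv_scale)
		recwin = (long)TCP_MAXWIN << rcv_scale;
	return (recwin);
}

int
main(void)
{
	/* Little buffer space left, but 1000 bytes already advertised. */
	printf("advertised: %ld\n",
	    calc_recwin(700, 65536, 1448, 5000, 4000, 2));
	return (0);
}

/*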
*/ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen + len); else #endif /* INET6 */ { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { m->m_pkthdr.csum_flags = CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; V_tcpstat.tcps_segstimed++; } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calculation, * because in6_cksum() needs it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, NULL, ((so->so_options & SO_DONTROUTE) ?
IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); } else #endif /* INET6 */ { ip->ip_len = m->m_pkthdr.len; #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However, the tcp hostcache mitigates the problem * so it affects only the first tcp connection with a host. */ if (V_path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, tp->t_inpcb->inp_options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would really be the correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } out: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: if (!tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * * tcp_mtudisc() will find out the new MTU and as * its last action, initiate retransmission, so it * is important to not do so here. * * If TSO was active we either got an interface * without TSO capabilities or TSO was turned off. * Disable it for this connection too and * immediately retry with MSS sized segments generated * by this function. */ if (tso) tp->t_flags &= ~TF_TSO; tcp_mtudisc(tp->t_inpcb, 0); return (0); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } V_tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window).
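*
* Illustrative sketch (not part of this revision): the back-out of
* the sequence advance done in the error path above. struct mini_conn
* and backout_send() are hypothetical sketch-local names, not kernel
* structures.
*/

#include <stdio.h>

typedef unsigned int tcp_seq;

struct mini_conn {
	tcp_seq	snd_nxt;	/* next send sequence */
	tcp_seq	hole_rxmit;	/* SACK hole retransmit pointer */
	long	sack_bytes_rexmit;
};

/*
 * Undo the advance after a failed transmit: a SACK retransmission
 * rewinds the hole's rxmit pointer and the rexmit byte count, a
 * normal send rewinds snd_nxt, so the data is offered again later.
 */
static void
backout_send(struct mini_conn *c, long len, int sack_rxmit)
{
	if (sack_rxmit) {
		c->hole_rxmit -= len;
		c->sack_bytes_rexmit -= len;
	} else
		c->snd_nxt -= len;
}

int
main(void)
{
	struct mini_conn c = { 10000, 8000, 1448 };

	backout_send(&c, 1448, 0);
	printf("snd_nxt after backout: %u\n", c.snd_nxt);
	return (0);
}

/*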
*/ if (sendalot && (!V_tcp_do_newreno || --maxburst)) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters at the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { INIT_VNET_INET(curvnet); u_int mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) continue; optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ =
TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } V_tcpstat.tcps_sack_send_blocks++; break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } Index: head/sys/netinet/tcp_reass.c =================================================================== --- head/sys/netinet/tcp_reass.c (revision 185087) +++ head/sys/netinet/tcp_reass.c (revision 185088) @@ -1,289 +1,297 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ +#ifdef VIMAGE_GLOBALS +static int tcp_reass_maxseg; +int tcp_reass_qsize; +static int tcp_reass_maxqlen; +static int tcp_reass_overflows; +#endif + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); -static int tcp_reass_maxseg = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); -int tcp_reass_qsize = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); -static int tcp_reass_maxqlen = 48; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); -static int tcp_reass_overflows = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { INIT_VNET_INET(curvnet); V_tcp_reass_maxseg = nmbclusters / 16; uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); } uma_zone_t tcp_reass_zone; void tcp_reass_init(void) { INIT_VNET_INET(curvnet); + + V_tcp_reass_maxseg = 0; + V_tcp_reass_qsize = 0; + V_tcp_reass_maxqlen = 48; + V_tcp_reass_overflows = 0; V_tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &V_tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { INIT_VNET_INET(curvnet); struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; int flags; INP_WLOCK_ASSERT(tp->t_inpcb); /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). While * doing that it should move to its own file tcp_reass.c. */ /* * Call with th==NULL after becoming established to * force pre-ESTABLISHED data up to the user socket. */ if (th == NULL) goto present; /* * Limit the number of segments in the reassembly queue to prevent * holding on to too many segments (and thus running out of mbufs). * Make sure to let through the missing segment that caused this * queue. Always keep one global queue entry spare to be able to * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && (V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg || tp->t_segqlen >= V_tcp_reass_maxqlen)) { V_tcp_reass_overflows++; V_tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } /* * Allocate a new queue entry. If we can't, or we hit the zone limit, * just drop the pkt.
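*
* Illustrative sketch (not part of this revision): the admission
* check above as a standalone predicate. reass_overflow() is a
* hypothetical name; the limits correspond to the reass.maxsegments
* and reass.maxqlen sysctls declared earlier in this file.
*/

#include <stdio.h>

typedef unsigned int tcp_seq;

/*
 * Drop an out-of-order segment rather than queue it? The segment
 * that fills the gap at rcv_nxt is always admitted; anything else is
 * refused once the global or per-connection limit is reached.
 */
static int
reass_overflow(tcp_seq th_seq, tcp_seq rcv_nxt, int global_qsize,
    int global_max, int qlen, int maxqlen)
{
	return (th_seq != rcv_nxt &&
	    (global_qsize + 1 >= global_max || qlen >= maxqlen));
}

int
main(void)
{
	/* Per-connection queue full (48 of 48). */
	printf("gap filler dropped? %d, other segment dropped? %d\n",
	    reass_overflow(5000, 5000, 100, 4096, 48, 48),
	    reass_overflow(7000, 5000, 100, 4096, 48, 48));
	return (0);
}

/*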
*/ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { V_tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } tp->t_segqlen++; V_tcp_reass_qsize++; /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { V_tcpstat.tcps_rcvduppack++; V_tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; V_tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } V_tcpstat.tcps_rcvoopack++; V_tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup_locked(so); return (flags); } Index: head/sys/netinet/tcp_sack.c =================================================================== --- head/sys/netinet/tcp_sack.c (revision 185087) +++ head/sys/netinet/tcp_sack.c (revision 185088) @@ -1,683 +1,686 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 */ /*- * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #include extern struct uma_zone *sack_hole_zone; +#ifdef VIMAGE_GLOBALS +int tcp_do_sack; +int tcp_sack_maxholes; +int tcp_sack_globalmaxholes; +int tcp_sack_globalholes; +#endif + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); -int tcp_do_sack = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, tcp_do_sack, 0, "Enable/Disable TCP SACK support"); TUNABLE_INT("net.inet.tcp.sack.enable", &tcp_do_sack); -static int tcp_sack_maxholes = 128; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); -static int tcp_sack_globalmaxholes = 65536; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); -static int tcp_sack_globalholes = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); /* * This function is called upon receipt of new valid data (while not in * header prediction mode), and it updates the ordered list of sacks. */ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { /* * First reported block MUST be the most recent one. Subsequent * blocks SHOULD be in the order in which they arrived at the * receiver. These two conditions make the implementation fully * compliant with RFC 2018. */ struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; int num_head, num_saved, i; INP_WLOCK_ASSERT(tp->t_inpcb); /* Check arguments. */ KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); /* SACK block for the received segment. */ head_blk.start = rcv_start; head_blk.end = rcv_end; /* * Merge updated SACK blocks into head_blk, and save unchanged SACK * blocks into saved_blks[]. num_saved will have the number of the * saved SACK blocks. */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this SACK block. */ } else if (SEQ_LEQ(head_blk.start, end) && SEQ_GEQ(head_blk.end, start)) { /* * Merge this SACK block into head_blk. This SACK * block itself will be discarded. */ if (SEQ_GT(head_blk.start, start)) head_blk.start = start; if (SEQ_LT(head_blk.end, end)) head_blk.end = end; } else { /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } /* * Update SACK list in tp->sackblks[]. */ num_head = 0; if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { /* * The received data segment is an out-of-order segment. Put * head_blk at the top of SACK list. */ tp->sackblks[0] = head_blk; num_head = 1; /* * If the number of saved SACK blocks exceeds its limit, * discard the last SACK block. 
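*
* Illustrative sketch (not part of this revision): the merge step
* above that folds an overlapping stored block into head_blk.
* merge_into_head() is a hypothetical name; the SEQ_* macros are the
* usual wrap-safe comparisons from <netinet/tcp_seq.h>.
*/

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

struct sackblk { tcp_seq start, end; };

/*
 * If the stored block overlaps head_blk, widen head_blk to cover it
 * and report a merge (the stored block is then discarded).
 */
static int
merge_into_head(struct sackblk *head, const struct sackblk *blk)
{
	if (SEQ_LEQ(head->start, blk->end) && SEQ_GEQ(head->end, blk->start)) {
		if (SEQ_GT(head->start, blk->start))
			head->start = blk->start;
		if (SEQ_LT(head->end, blk->end))
			head->end = blk->end;
		return (1);
	}
	return (0);
}

int
main(void)
{
	struct sackblk head = { 100, 200 }, old = { 150, 300 };

	if (merge_into_head(&head, &old))
		printf("merged: [%u, %u)\n", head.start, head.end);
	return (0);
}

/*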
*/ if (num_saved >= MAX_SACK_BLKS) num_saved--; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[num_head], sizeof(struct sackblk) * num_saved); } /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; } /* * Delete all receiver-side SACK information. */ void tcp_clean_sackreport(struct tcpcb *tp) { int i; INP_WLOCK_ASSERT(tp->t_inpcb); tp->rcv_numsacks = 0; for (i = 0; i < MAX_SACK_BLKS; i++) tp->sackblks[i].start = tp->sackblks[i].end=0; } /* * Allocate struct sackhole. */ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct sackhole *hole; if (tp->snd_numholes >= V_tcp_sack_maxholes || V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { V_tcpstat.tcps_sack_sboverflow++; return NULL; } hole = (struct sackhole *)uma_zalloc(sack_hole_zone, M_NOWAIT); if (hole == NULL) return NULL; hole->start = start; hole->end = end; hole->rxmit = start; tp->snd_numholes++; V_tcp_sack_globalholes++; return hole; } /* * Free struct sackhole. */ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { INIT_VNET_INET(tp->t_vnet); uma_zfree(sack_hole_zone, hole); tp->snd_numholes--; V_tcp_sack_globalholes--; KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* * Insert new SACK hole into scoreboard. */ static struct sackhole * tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *after) { struct sackhole *hole; /* Allocate a new SACK hole. */ hole = tcp_sackhole_alloc(tp, start, end); if (hole == NULL) return NULL; /* Insert the new SACK hole into scoreboard. */ if (after != NULL) TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); else TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); /* Update SACK hint. */ if (tp->sackhint.nexthole == NULL) tp->sackhint.nexthole = hole; return hole; } /* * Remove SACK hole from scoreboard. */ static void tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) { /* Update SACK hint. */ if (tp->sackhint.nexthole == hole) tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); /* Remove this SACK hole. */ TAILQ_REMOVE(&tp->snd_holes, hole, scblink); /* Free this SACK hole. */ tcp_sackhole_free(tp, hole); } /* * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). */ void tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; int i, j, num_sack_blks; INP_WLOCK_ASSERT(tp->t_inpcb); num_sack_blks = 0; /* * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, * treat [SND.UNA, SEG.ACK) as if it is a SACK block. */ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { sack_blocks[num_sack_blks].start = tp->snd_una; sack_blocks[num_sack_blks++].end = th_ack; } /* * Append received valid SACK blocks to sack_blocks[], but only if we * received new blocks from the other side. 
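*
* Illustrative sketch (not part of this revision): the six sanity
* checks applied to each received SACK block directly below, as one
* predicate. sack_block_valid() is a hypothetical name; SEQ_* are the
* usual wrap-safe comparisons.
*/

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

/*
 * A block is usable only if it is non-empty, lies entirely above both
 * SND.UNA and the segment's cumulative ack, and ends at or before
 * SND.MAX.
 */
static int
sack_block_valid(tcp_seq start, tcp_seq end, tcp_seq snd_una,
    tcp_seq th_ack, tcp_seq snd_max)
{
	return (SEQ_GT(end, start) &&
	    SEQ_GT(start, snd_una) &&
	    SEQ_GT(start, th_ack) &&
	    SEQ_LT(start, snd_max) &&
	    SEQ_GT(end, snd_una) &&
	    SEQ_LEQ(end, snd_max));
}

int
main(void)
{
	printf("valid? %d\n", sack_block_valid(2000, 3000, 1000, 1000, 5000));
	return (0);
}

/*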
*/ if (to->to_flags & TOF_SACK) { for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, tp->snd_una) && SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && SEQ_LEQ(sack.end, tp->snd_max)) sack_blocks[num_sack_blks++] = sack; } } /* * Return if SND.UNA is not advanced and no valid SACK block is * received. */ if (num_sack_blks == 0) return; /* * Sort the SACK blocks so we can update the scoreboard with just one * pass. The overhead of sorting up to 4+1 elements is less than * making up to 4+1 passes over the scoreboard. */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } if (TAILQ_EMPTY(&tp->snd_holes)) /* * Empty scoreboard. Need to initialize snd_fack (it may be * uninitialized or have a bogus value). Scoreboard holes * (from the sack blocks received) are created later below * (in the logic that adds holes to the tail of the * scoreboard). */ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); /* * In the while-loop below, incoming SACK blocks (sack_blocks[]) and * SACK holes (snd_holes) are traversed from their tails with just * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; } else { /* * We failed to add a new hole based on the current * sack block. Skip over all the sack blocks that * fall completely to the right of snd_fack and * proceed to trim the scoreboard based on the * remaining sack blocks. This also trims the * scoreboard for th_ack (which is sack_blocks[0]). */ while (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->start)) sblkp--; if (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } } else if (SEQ_LT(tp->snd_fack, sblkp->end)) /* fack is advanced. */ tp->snd_fack = sblkp->end; /* We must have at least one SACK hole in scoreboard. */ KASSERT(!TAILQ_EMPTY(&tp->snd_holes), ("SACK scoreboard must not be empty")); cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* * Since the incoming sack blocks are sorted, we can process them * making one sweep of the scoreboard. */ while (sblkp >= sack_blocks && cur != NULL) { if (SEQ_GEQ(sblkp->start, cur->end)) { /* * SACKs data beyond the current hole. Go to the * previous sack block. */ sblkp--; continue; } if (SEQ_LEQ(sblkp->end, cur->start)) { /* * SACKs data before the current hole. Go to the * previous hole.
*/ cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Acks entire hole, so delete hole. */ temp = cur; cur = TAILQ_PREV(cur, sackhole_head, scblink); tcp_sackhole_remove(tp, temp); /* * The sack block may ack all or part of the * next hole too, so continue onto the next * hole. */ continue; } else { /* Move start of hole forward. */ cur->start = sblkp->end; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); } } else { /* Data acks at least the end of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Move end of hole backward. */ cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } else { /* * ACKs some data in middle of a hole; need * to split current hole */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; tp->sackhint.sack_bytes_rexmit += (temp->rxmit - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } } } tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. * Accordingly, we advance one or the other. */ if (SEQ_LEQ(sblkp->start, cur->start)) cur = TAILQ_PREV(cur, sackhole_head, scblink); else sblkp--; } } /* * Free all SACK holes to clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *q; INP_WLOCK_ASSERT(tp->t_inpcb); while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) tcp_sackhole_remove(tp, q); tp->sackhint.sack_bytes_rexmit = 0; KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); KASSERT(tp->sackhint.nexthole == NULL, ("tp->sackhint.nexthole == NULL")); } /* * Partial ack handling within a sack recovery episode. Keeping this very * simple for now. When a partial ack is received, force snd_cwnd to a value * that will allow the sender to transmit no more than 2 segments. If * necessary, a better scheme can be adopted at a later point, but for now, * the goal is to prevent the sender from bursting a large amount of data in * the midst of sack recovery. */ void tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) { int num_segs = 1; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } #if 0 /* * Debug version of tcp_sack_output() that walks the scoreboard. Used for * now to sanity check the hint. */ static struct sackhole * tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *p; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = 0; TAILQ_FOREACH(p, &tp->snd_holes, scblink) { if (SEQ_LT(p->rxmit, p->end)) { if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ continue; } *sack_bytes_rexmt += (p->rxmit - p->start); break; } *sack_bytes_rexmt += (p->rxmit - p->start); } return (p); } #endif /* * Returns the next hole to retransmit and the number of retransmitted bytes * from the scoreboard. 
We store both the next hole and the number of * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK * reception). This avoids scoreboard traversals completely. * * The loop here will traverse *at most* one link. Here's the argument. For * the loop to traverse more than 1 link before finding the next hole to * retransmit, we would need to have at least 1 node following the current * hint with (rxmit == end). But, for all holes following the current hint, * (start == rxmit), since we have not yet retransmitted from them. * Therefore, in order to traverse more than 1 link in the loop below, we need to * have at least one node following the current hint with (start == rxmit == * end). But that can't happen: (start == end) means that all the data in * that hole has been sacked, in which case, the hole would have been removed * from the scoreboard. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *hole = NULL; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; hole = tp->sackhint.nexthole; if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) goto out; while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { if (SEQ_LT(hole->rxmit, hole->end)) { tp->sackhint.nexthole = hole; break; } } out: return (hole); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tp->t_inpcb); if (cur == NULL) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) return; /* We're already beyond any SACKed blocks */ /*- * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, p->start)) cur = p; else { tp->snd_nxt = p->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->snd_fack; } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 185087) +++ head/sys/netinet/tcp_subr.c (revision 185088) @@ -1,2254 +1,2300 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #include #endif /*IPSEC*/ #include #include #include -int tcp_mssdflt = TCP_MSS; +#ifdef VIMAGE_GLOBALS +int tcp_mssdflt; #ifdef INET6 -int tcp_v6mssdflt = TCP6_MSS; +int tcp_v6mssdflt; #endif +int tcp_minmss; +int tcp_do_rfc1323; +static int icmp_may_rst; +static int tcp_isn_reseed_interval; +static int tcp_inflight_enable; +static int tcp_inflight_rttthresh; +static int tcp_inflight_min; +static int tcp_inflight_max; +static int tcp_inflight_stab; +#endif static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(TD_TO_VNET(curthread)); int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLTYPE_INT|CTLFLAG_RW, &tcp_mssdflt, 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(TD_TO_VNET(curthread)); int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLTYPE_INT|CTLFLAG_RW, &tcp_v6mssdflt, 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
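* * Illustrative arithmetic (not part of the original comment): moving 1 Mbyte at an MSS of 1460 takes roughly 700 segments, while a forced MSS of 20 would take more than 50,000 segments of mostly header overhead for the same payload.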
*/ -int tcp_minmss = TCP_MINMSS; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, tcp_minmss, 0, "Minimum TCP Maximum Segment Size"); -int tcp_do_rfc1323 = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs"); -static int icmp_may_rst = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); -static int tcp_isn_reseed_interval = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); -static int tcp_inflight_enable = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); -static int tcp_inflight_rttthresh; SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", "RTT threshold below which inflight will deactivate itself"); -static int tcp_inflight_min = 6144; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); -static int tcp_inflight_stab = 20; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize. */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts.
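* * Concretely, tcpcb_zone below hands out a struct tcpcb_mem wrapper, and tcp_newtcpcb() points tp->t_timers at the embedded struct tcp_timer, so a single allocation covers both the exported tcpcb and the private callout state.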
*/ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; }; static uma_zone_t tcpcb_zone; MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { INIT_VNET_INET(curvnet); + int hashsize; - int hashsize = TCBHASHSIZE; + V_blackhole = 0; + V_tcp_delack_enabled = 1; + V_drop_synfin = 0; + V_tcp_do_rfc3042 = 1; + V_tcp_do_rfc3390 = 1; + V_tcp_do_ecn = 0; + V_tcp_ecn_maxretries = 1; + V_tcp_insecure_rst = 0; + V_tcp_do_autorcvbuf = 1; + V_tcp_autorcvbuf_inc = 16*1024; + V_tcp_autorcvbuf_max = 256*1024; + + V_tcp_mssdflt = TCP_MSS; +#ifdef INET6 + V_tcp_v6mssdflt = TCP6_MSS; +#endif + V_tcp_minmss = TCP_MINMSS; + V_tcp_do_rfc1323 = 1; + V_icmp_may_rst = 1; + V_tcp_isn_reseed_interval = 0; + V_tcp_inflight_enable = 1; + V_tcp_inflight_min = 6144; + V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; + V_tcp_inflight_stab = 20; + + V_path_mtu_discovery = 1; + V_ss_fltsz = 1; + V_ss_fltsz_local = 4; + V_tcp_do_newreno = 1; + V_tcp_do_tso = 1; + V_tcp_do_autosndbuf = 1; + V_tcp_autosndbuf_inc = 8*1024; + V_tcp_autosndbuf_max = 256*1024; + + V_nolocaltimewait = 0; + + V_tcp_do_sack = 1; + V_tcp_sack_maxholes = 128; + V_tcp_sack_globalmaxholes = 65536; + V_tcp_sack_globalholes = 0; + tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); LIST_INIT(&V_tcb); V_tcbinfo.ipi_listhead = &V_tcb; + hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, &V_tcbinfo.ipi_hashmask); V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, &V_tcbinfo.ipi_porthashmask); V_tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR /* * These have to be type stable for the benefit of the timers. 
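* (Type stability comes from creating the zone with UMA_ZONE_NOFREE below: pages once used for tcpcb_mem objects are never reclaimed by UMA, so a callout that fires late in teardown still finds memory with the expected layout.)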
*/ tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_init(); syncache_init(); tcp_hc_init(); tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void tcp_fini(void *xtp) { callout_stop(&isn_callout); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else #endif { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
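* * A sketch of the keepalive usage (assembled for illustration from the tcp_timer keepalive path; details may differ): * * struct tcptemp *t = tcpip_maketemplate(inp); * if (t != NULL) { * tcp_respond(tp, t->tt_ipgen, &t->tt_t, (struct mbuf *)NULL, * tp->rcv_nxt, tp->snd_una - 1, 0); * free(t, M_TEMP); * } * * Here m == NULL, so a fresh header-only segment is built from the template, and the deliberately old seq of snd_una - 1 makes the peer answer with a bare ACK.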
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { INIT_VNET_INET(curvnet); int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { /* * Reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * This is usually the case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp != NULL ?
tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); else #endif /* INET6 */ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { INIT_VNET_INET(inp->inp_vnet); struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output_reset(tp); V_tcpstat.tcps_drops++; } else V_tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { INIT_VNET_INET(tp->t_vnet); struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we * delete the PCB. 
*/ callout_stop(&tp->t_timers->tt_rexmt); callout_stop(&tp->t_timers->tt_persist); callout_stop(&tp->t_timers->tt_keep); callout_stop(&tp->t_timers->tt_2msl); callout_stop(&tp->t_timers->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * close enough to the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Always update the ssthresh when the conditions below * are satisfied. This gives us a better starting value * for congestion avoidance on new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * Convert the limit from user data bytes to * packets, then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; /* XXX: This wraps if the pipe is more than 4 Gbit per second */ metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; } /* Disconnect offload device, if any. */ tcp_offload_detach(tp); tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* Notify any offload devices of listener close */ if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_close(tp); in_pcbdrop(inp); V_tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if any exist, and flush the reassembly queue, * if there is one...
* XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. */ INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; V_tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); } INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { m = syncache_pcbcount(); n = V_tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + ((m + n) + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. 
*/ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) inp_list[i++] = inp; } INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_INFO_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_tcbinfo); error = ENOENT; } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); INIT_VNET_INET6(curvnet); struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, mapped = 0; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } INP_INFO_RLOCK(&V_tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, 
&addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_tcbinfo); error = ENOENT; } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { INIT_VNET_INET(curvnet); struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is deprecated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_WLOCK(inp); if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag, set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_flags = 0; /* IPv4 */ inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. ip->ip_len has already * been swapped in icmp_input(). */ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); if (mtu < max(296, V_tcp_minmss + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) mtu = V_tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do the right thing by itself.
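* A worked case (illustrative): a needfrag for a 1500-byte datagram carrying icmp_nextmtu == 0 makes ip_next_mtu() above propose the next smaller well-known MTU; the proposal is discarded unless it clears the max(296, tcp_minmss + headers) floor, and it is cached here only when it is below the interface or route MTU.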
*/ if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_WUNLOCK(inp); } else { inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&V_tcbinfo); } else in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { INIT_VNET_INET(curvnet); struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is deprecated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non-NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; INP_INFO_WLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based on the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires periodically * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND.
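* (Concretely, tcp_isn_tick() below reschedules itself every hz/100 ticks and advances the offset by ISN_BYTES_PER_SECOND / 100 = 10,485 bytes of sequence space per run, which works out to the 1 Mbyte per second target rate.)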
The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) +#ifdef VIMAGE_GLOBALS static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; +#endif tcp_seq tcp_new_isn(struct tcpcb *tp) { INIT_VNET_INET(tp->t_vnet); u_int32_t md5_buffer[4]; tcp_seq new_isn; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&V_isn_ctx); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&V_isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &V_isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary * to keep time flowing at a relatively constant rate. If the random * increments have already pushed us past the projected offset, do nothing. */ static void tcp_isn_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); u_int32_t projected_offset; ISN_LOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ INIT_VNET_INET(curvnet); projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
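* The check can be disabled at runtime, e.g. with sysctl net.inet.tcp.icmp_may_rst=0; the knob name is taken from the sysctl declaration earlier in this file.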
*/ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { #ifdef INVARIANTS INIT_VNET_INET(inp->inp_vnet); #endif struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { INIT_VNET_INET(inp->inp_vnet); struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); V_tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp); tcp_output_send(tp); return (inp); } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; in_rtalloc_ign(&sro, RTF_CLONING, inc->inc_fibnum); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); /* Report additional interface capabilities. */ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro.ro_rt); } return (maxmtu); } #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; rtalloc_ign((struct route *)&sro6, RTF_CLONING); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ if (flags != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) *flags |= CSUM_TSO; } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. 
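* (The function below sizes this by building a throwaway header-only mbuf and asking ipsec4_hdrsiz()/ipsec6_hdrsiz(); callers presumably subtract the result when sizing the usable MSS. This is an interpretive note, not part of the original comment.)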
*/ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (though not intentionally so) to TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only affects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches. A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typically 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window.
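* (For scale, with illustrative numbers: a path carrying 1 Mbyte/sec over a 50 ms RTT holds only about 50 Kbytes in flight, so even modest RTT noise moves the raw bandwidth times delay product by a large fraction per sample.)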
We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { INIT_VNET_INET(tp->t_vnet); u_long bw; u_long bwnd; int save_ticks; INP_WLOCK_ASSERT(tp->t_inpcb); /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (V_tcp_inflight_enable == 0 || tp->t_rttlow < V_tcp_inflight_rttthresh) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rolls over, 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case and stabilization. * Without the slop we could be locking ourselves into a lower * bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). * * (4) Stability value (defaults to 20 = 2 maximal packets) can * be adjusted with a sysctl but typically only needs to be adjusted * on very slow connections. A value no smaller than 5 * should be used, but only reduce this default if you have * no other choice. */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd); } } if ((long)bwnd < V_tcp_inflight_min) bwnd = V_tcp_inflight_min; if (bwnd > V_tcp_inflight_max) bwnd = V_tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * Compute TCP-MD5 hash of a TCP segment.
(RFC2385) * * Parameters: * m pointer to head of mbuf chain * _unused * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * Return 0 if successful, otherwise return -1. * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 * right now. Another branch of this code exists which uses the SPD to * specify per-application flows but it is unstable. */ int tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; struct ippseudo ippseudo; MD5_CTX ctx; int doff; struct ip *ip; struct ipovly *ipovly; struct secasvar *sav; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; char ip6buf[INET6_ADDRSTRLEN]; uint32_t plen; uint16_t nhdr; #endif u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst.sa.sa_len = sizeof(struct sockaddr_in6); dst.sa.sa_family = AF_INET6; dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? ip6->ip6_src : ip6->ip6_dst; break; #endif default: return (EINVAL); /* NOTREACHED */ break; } /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : #ifdef INET6 (ip->ip_v == (IPV6_VERSION >> 4)) ? ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); return (EINVAL); } MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; #ifdef INET6 /* * RFC 2385, 2.0 Proposal * For IPv6, the pseudo-header is as described in RFC 2460, namely the * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- * extended next header value (to form 32 bits), and 32-bit segment * length. 
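The digest order described above (pseudo-header, then TCP header with th_sum zeroed, then segment data, then the shared key) can be reproduced in user space with the same libmd MD5 calls this file uses. A sketch with an invented address pair and key, assuming FreeBSD's <md5.h> (link with -lmd); this is only an order-of-operations illustration, not an interoperability-tested RFC 2385 implementation:

#include <sys/types.h>
#include <arpa/inet.h>
#include <md5.h>    /* FreeBSD libmd: MD5Init/MD5Update/MD5Final */
#include <stdint.h>
#include <stdio.h>

/* IPv4 pseudo-header, all fields in network byte order. */
struct pseudo {
    uint32_t src, dst;
    uint8_t pad, proto;
    uint16_t len;       /* TCP header + options + payload */
};

int main(void)
{
    MD5_CTX ctx;
    unsigned char sig[16];
    unsigned char th[20] = { 0 };   /* TCP header, th_sum already zero */
    const char payload[] = "data";
    const char key[] = "secret";    /* stands in for the SADB key */
    struct pseudo ph = { htonl(0x0a000001), htonl(0x0a000002),
        0, 6 /* IPPROTO_TCP */, htons(sizeof(th) + 4) };
    int i;

    MD5Init(&ctx);
    MD5Update(&ctx, &ph, sizeof(ph));           /* 1: pseudo-header */
    MD5Update(&ctx, th, sizeof(th));            /* 2: TCP header, csum 0 */
    MD5Update(&ctx, payload, 4);                /* 3: segment data */
    MD5Update(&ctx, key, sizeof(key) - 1);      /* 4: shared secret */
    MD5Final(sig, &ctx);

    for (i = 0; i < 16; i++)
        printf("%02x", sig[i]);
    printf("\n");
    return (0);
}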
* Note: Upper-Layer Packet Length comes before Next Header. */ case (IPV6_VERSION >> 4): in6 = ip6->ip6_src; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); in6 = ip6->ip6_dst; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); plen = htonl(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); nhdr = 0; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); nhdr = IPPROTO_TCP; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; break; #endif default: return (EINVAL); /* NOTREACHED */ break; } /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); #ifdef INET6 INIT_VNET_INET6(curvnet); #endif /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; struct in6_addr f6, l6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: return (EINVAL); } INP_INFO_WLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup_hash(&V_tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { INP_WLOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its 
timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ /* Is logging enabled? */ if (tcp_log_debug == 0 && tcp_log_in_vain == 0) return (NULL); s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && inc->inc_isipv6 == 0) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 185087) +++ head/sys/netinet/tcp_syncache.c (revision 185088) @@ -1,1758 +1,1764 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. 
under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include -static int tcp_syncookies = 1; +#ifdef VIMAGE_GLOBALS +static struct tcp_syncache tcp_syncache; +static int tcp_syncookies; +static int tcp_syncookiesonly; +int tcp_sc_rst_sock_fail; +#endif + SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, &tcp_syncookies, 0, "Use TCP SYN cookies if the syncache overflows"); -static int tcp_syncookiesonly = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, &tcp_syncookiesonly, 0, "Use only TCP SYN cookies"); #ifdef TCP_OFFLOAD_DISABLE #define TOEPCB_ISSET(sc) (0) #else #define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); static int syncache_respond(struct syncache *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static void syncookie_generate(struct syncache_head *, struct syncache *, u_int32_t *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcpopt *, struct tcphdr *, struct socket *); /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. 
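The 45-second figure quoted in the next sentence is just the 3-second base RTO (TCPTV_RTOBASE in this era) scaled by the first entries of the kernel's exponential tcp_backoff table, summed over the initial send plus three retransmits. A quick standalone check, with the constants hard-coded for illustration:

#include <stdio.h>

int main(void)
{
    const int rto_base = 3;                 /* TCPTV_RTOBASE, seconds */
    const int backoff[] = { 1, 2, 4, 8 };   /* head of tcp_backoff */
    int i, total = 0;

    for (i = 0; i < 4; i++) {               /* initial send + 3 rexmits */
        printf("transmission %d: wait %d s\n",
            i + 1, rto_base * backoff[i]);
        total += rto_base * backoff[i];
    }
    printf("entry abandoned after %d s\n", total);  /* 3*(1+2+4+8) == 45 */
    return (0);
}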
* 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 -static struct tcp_syncache tcp_syncache; - SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, tcp_syncache.cache_count, 0, "Current number of entries in syncache"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); -int tcp_sc_rst_sock_fail = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_RW, tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { INIT_VNET_INET(curvnet); if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { INIT_VNET_INET(curvnet); int i; + + V_tcp_syncookies = 1; + V_tcp_syncookiesonly = 0; + V_tcp_sc_rst_sock_fail = 1; V_tcp_syncache.cache_count = 0; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. 
*/ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); } /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { INIT_VNET_INET(sch->sch_vnet); struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); V_tcpstat.tcps_sc_bucketoverflow++; } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); V_tcp_syncache.cache_count++; V_tcpstat.tcps_sc_added++; } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { INIT_VNET_INET(sch->sch_vnet); SCH_LOCK_ASSERT(sch); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifndef TCP_OFFLOAD_DISABLE if (sc->sc_tu) sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); #endif syncache_free(sc); V_tcp_syncache.cache_count--; } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; INIT_VNET_INET(sch->sch_vnet); struct syncache *sc, *nsc; int tick = ticks; char *s; /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. 
*/ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); V_tcpstat.tcps_sc_stale++; continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } (void) syncache_respond(sc); V_tcpstat.tcps_sc_retransmitted++; syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); } /* * Find an entry in the syncache. * Always returns with a locked syncache_head plus a matching entry or NULL. */ struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; #ifdef INET6 if (inc->inc_isipv6) { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } else #endif { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_isipv6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } SCH_LOCK_ASSERT(*schp); return (NULL); /* always returns with locked sch */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); V_tcpstat.tcps_badrst++; goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); V_tcpstat.tcps_badrst++; goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. A self-contained * sketch of this in-window test appears just below.
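The in-window test is done in modular 32-bit sequence space. A self-contained sketch re-deriving the kernel's SEQ_* comparisons; the near-wrap IRS is contrived to show why a plain <= comparison would misfire:

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe comparisons, same idea as the kernel's SEQ_* macros. */
#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)
#define SEQ_LEQ(a, b)   ((int32_t)((a) - (b)) <= 0)

static int
rst_in_window(uint32_t seq, uint32_t irs, uint32_t wnd)
{
    return (SEQ_GEQ(seq, irs) && SEQ_LEQ(seq, irs + wnd));
}

int main(void)
{
    uint32_t irs = 0xfffffff0u;     /* peer's ISS, close to the wrap */
    uint32_t wnd = 65535;           /* our advertised window */

    printf("%d\n", rst_in_window(irs + 1, irs, wnd));       /* 1: drop */
    printf("%d\n", rst_in_window(irs + wnd + 5, irs, wnd)); /* 0: ignore */
    return (0);
}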
* * The sequence number in the reset segment is normally an * echo of our outgoing acknowledgement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); V_tcpstat.tcps_sc_reset++; } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "IRS %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); V_tcpstat.tcps_badrst++; } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); V_tcpstat.tcps_sc_badack++; } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) goto done; /* * If we've retransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); V_tcpstat.tcps_sc_unreach++; done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { INIT_VNET_INET(lso->so_vnet); struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; char *s; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, SS_ISCONNECTED); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ V_tcpstat.tcps_listendrop++; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_mbuf(m, so); SOCK_UNLOCK(so); #endif inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = sc->sc_inc.inc_fibnum; so->so_fibnum = sc->sc_inc.inc_fibnum; INP_WLOCK(inp); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif inp->inp_lport = sc->sc_inc.inc_lport; if (in_pcbinshash(inp) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists.
*/ #ifdef INET6 if (sc->sc_inc.inc_isipv6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; goto abort; } #ifdef IPSEC /* Copy old policy into new socket's. */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if (in6_pcbconnect(inp, (struct sockaddr *)&sin6, thread0.td_ucred)) { inp->in6p_laddr = laddr6; goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; inp->in6p_flowinfo |= sc->sc_flowlabel; } else #endif { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if (in_pcbconnect(inp, (struct sockaddr *)&sin, thread0.td_ucred)) { inp->inp_laddr = laddr; goto abort; } } tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = ticks; tp->ts_offset = sc->sc_tsoff; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } if (sc->sc_flags & SCF_ECN) tp->t_flags |= TF_ECN_PERMIT; /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. 
*/ if (sc->sc_rxmits) tp->snd_cwnd = tp->t_maxseg; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); INP_WUNLOCK(inp); V_tcpstat.tcps_accepts++; return (so); abort: INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (!tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } } else { /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; V_tcp_syncache.cache_count--; SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if ((SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && !TOEPCB_ISSET(sc)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) V_tcpstat.tcps_sc_aborted++; else V_tcpstat.tcps_sc_completed++; /* how do we find the inp for the new socket? 
*/ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } int tcp_offload_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { int rc; INP_INFO_WLOCK(&V_tcbinfo); rc = syncache_expand(inc, to, th, lsop, m); INP_INFO_WUNLOCK(&V_tcbinfo); return (rc); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ static void _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, struct toe_usrreqs *tu, void *toepcb) { INIT_VNET_INET(inp->inp_vnet); struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int32_t flowtmp; int win, sb_hiwat, ip_ttl, ip_tos, noopt; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ so = *lsop; tp = sototcpcb(so); cred = crhold(so->so_cred); #ifdef INET6 if (inc->inc_isipv6 && (inp->in6p_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; win = sbspace(&so->so_rcv); sb_hiwat = so->so_rcv.sb_hiwat; noopt = (tp->t_flags & TF_NOOPT); /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); goto done; } else mac_syncache_create(maclabel, inp); #endif INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); /* * Remember the IP options, if any. */ #ifdef INET6 if (!inc->inc_isipv6) #endif ipopts = (m) ? ip_srcroute(m) : NULL; /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { #ifndef TCP_OFFLOAD_DISABLE if (sc->sc_tu) sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, sc->sc_toepcb); #endif V_tcpstat.tcps_sc_dupsyn++; if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. 
*/ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); KASSERT(sc->sc_label != NULL, ("%s: label not initialized", __func__)); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); V_tcpstat.tcps_sndacks++; V_tcpstat.tcps_sndtotal++; } SCH_UNLOCK(sch); goto done; } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ V_tcpstat.tcps_sc_zonefail++; if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; cred = NULL; sc->sc_ipopts = ipopts; sc->sc_inc.inc_fibnum = inp->inp_inc.inc_fibnum; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!inc->inc_isipv6) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifndef TCP_OFFLOAD_DISABLE sc->sc_tu = tu; sc->sc_toepcb = toepcb; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = ticks; sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatiblity problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #ifdef TCP_SIGNATURE /* * If listening socket requested TCP digests, and received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK. 
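The scaling-factor loop a few lines above is easy to sanity-check in isolation. A sketch assuming the era's default kern.ipc.maxsockbuf of 256K (an assumption, not read from a live system), which reproduces the scale factor of 3 claimed in the comment:

#include <stdio.h>

#define TCP_MAXWIN        65535
#define TCP_MAX_WINSHIFT  14

int main(void)
{
    long sb_max = 256 * 1024;   /* assumed default maxsockbuf */
    int wscale = 0;

    /* Smallest wscale such that TCP_MAXWIN << wscale covers sb_max. */
    while (wscale < TCP_MAX_WINSHIFT &&
        ((long)TCP_MAXWIN << wscale) < sb_max)
        wscale++;
    printf("sb_max %ld -> requested r_scale %d\n", sb_max, wscale);
    return (0);     /* prints 3, matching the comment above */
}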
* XXX: Currently we always record the option by default and will * attempt to use it in syncache_respond(). */ if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (noopt) sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; if (tcp_syncookies) { syncookie_generate(sch, sc, &flowtmp); #ifdef INET6 if (autoflowlabel) sc->sc_flowlabel = flowtmp; #endif } else { #ifdef INET6 if (autoflowlabel) sc->sc_flowlabel = (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); #endif } SCH_UNLOCK(sch); /* * Do a standard 3-way handshake. */ if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { if (tcp_syncookies && tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ V_tcpstat.tcps_sndacks++; V_tcpstat.tcps_sndtotal++; } else { if (sc != &scs) syncache_free(sc); V_tcpstat.tcps_sc_dropped++; } done: if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif if (m) { *lsop = NULL; m_freem(m); } } static int syncache_respond(struct syncache *sc) { INIT_VNET_INET(curvnet); struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th; int optlen, error; u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } else #endif { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = tlen; ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. 
Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= IP_DF; th = (struct tcphdr *)(ip + 1); } th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; if (sc->sc_flags & SCF_ECN) { th->th_flags |= TH_ECE; V_tcpstat.tcps_ecn_shs++; } /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tcp_signature_compute(m, 0, 0, optlen, to.to_signature, IPSEC_DIR_OUTBOUND); #endif #ifdef INET6 if (sc->sc_inc.inc_isipv6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len += optlen; } else optlen = 0; #ifdef INET6 if (sc->sc_inc.inc_isipv6) { th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen + optlen - hlen); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } return (error); } void syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m) { _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); } void tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct toe_usrreqs *tu, void *toepcb) { INIT_VNET_INET(curvnet); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); _syncache_add(inc, to, th, inp, lsop, NULL, tu, toepcb); } /* * The purpose of SYN cookies is to avoid keeping track of all SYN's we * receive and to be able to handle SYN floods from bogus source addresses * (where we will never receive any reply). SYN floods try to exhaust all * our memory and available slots in the SYN cache table to cause a denial * of service to legitimate users of the local host. * * The idea of SYN cookies is to encode and include all necessary information * about the connection setup state within the SYN-ACK we send back and thus * to get along without keeping any local state until the ACK to the SYN-ACK * arrives (if ever). Everything we need to know should be available from * the information we encoded in the SYN-ACK. 
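The ISN cookie layout spelled out a little further below packs a secret-parity bit, a 3-bit rotation offset and a 3-bit MSS-table index under 25 bits of digest. A sketch of the encode/decode round trip; the digest word is a stand-in constant, not a real MD5 output:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t oddeven = 1;           /* which secret, 1 bit */
    uint32_t off = 5;               /* secret rotation, 3 bits */
    uint32_t mss_idx = 6;           /* index into tcp_sc_msstab, 3 bits */
    uint32_t md5_word0 = 0xdeadbeef; /* stand-in for md5_buffer[0] */

    /* Encode, as syncookie_generate() does. */
    uint32_t iss = oddeven | (off << 1) | (mss_idx << 4) |
        (md5_word0 << 7);
    printf("cookie ISS = 0x%08x\n", iss);

    /* Decode, as syncookie_lookup() does on the reflected ack - 1. */
    printf("parity %u, off %u, mss index %u\n",
        iss & 0x1, (iss >> 1) & 0x7, (iss >> 4) & 0x7);
    return (0);
}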
* * More information about the theory behind SYN cookies and their first * discussion and specification can be found at: * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (gory details) * * This implementation extends the original idea and first implementation * of FreeBSD by using not only the initial sequence number field to store * information but also the timestamp field if present. This way we can * keep track of the entire state we need to know to recreate the session in * its original form. Almost all TCP speakers implement RFC1323 timestamps * these days. For those that do not we still have to live with the known * shortcomings of the ISN-only SYN cookies. * * Cookie layers: * * Initial sequence number we send: * 31|................................|0 * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP * D = MD5 Digest (first dword) * M = MSS index * R = Rotation of secret * P = Odd or Even secret * * The MD5 Digest is computed over the following parameters: * a) randomly rotated secret * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6) * c) the received initial sequence number from remote host * d) the rotation offset and odd/even bit * * Timestamp we send: * 31|................................|0 * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5 * D = MD5 Digest (third dword) (only as filler) * S = Requested send window scale * R = Requested receive window scale * A = SACK allowed * 5 = TCP-MD5 enabled (not implemented yet) * XORed with MD5 Digest (fourth dword) * * The timestamp isn't cryptographically secure and doesn't need to be. * The double use of the MD5 digest dwords ties it to a specific remote/ * local host/port, remote initial sequence number and our local time * limited secret. A received timestamp is reverted (XORed) and then * the contained MD5 dword is compared to the computed one to ensure the * timestamp belongs to the SYN-ACK we sent. The other parameters may * have been tampered with but this isn't different from supplying bogus * values in the SYN in the first place. * * Some problems with SYN cookies remain however: * Consider the problem of a recreated (and retransmitted) cookie. If the * original SYN was accepted, the connection is established. The second * SYN is in flight, and if it arrives with an ISN that falls within the * receive window, the connection is killed. * * Notes: * A heuristic to determine when to accept syn cookies is not necessary. * An ACK flood would cause the syncookie verification to be attempted, * but a SYN flood causes syncookies to be generated. Both are of equal * cost, so there's no point in trying to optimize the ACK flood case. * Also, if you don't process certain ACKs for some reason, then all someone * would have to do is launch a SYN and ACK flood at the same time, which * would stop cookie verification and defeat the entire purpose of syncookies. */ static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 }; static void syncookie_generate(struct syncache_head *sch, struct syncache *sc, u_int32_t *flowlabel) { INIT_VNET_INET(curvnet); MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data; u_int32_t *secbits; u_int off, pmss, mss; int i; SCH_LOCK_ASSERT(sch); /* Which of the two secrets to use. */ secbits = sch->sch_oddeven ? sch->sch_secbits_odd : sch->sch_secbits_even; /* Reseed secret if too old. */ if (sch->sch_reseed < time_uptime) { sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */ secbits = sch->sch_oddeven ?
sch->sch_secbits_odd : sch->sch_secbits_even; for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++) secbits[i] = arc4random(); sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME; } /* Secret rotation offset. */ off = sc->sc_iss & 0x7; /* iss was randomized before */ /* Maximum segment size calculation. */ pmss = max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss); for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) if (tcp_sc_msstab[mss] <= pmss) break; /* Fold parameters and MD5 digest into the ISN we will send. */ data = sch->sch_oddeven;/* odd or even secret, 1 bit */ data |= off << 1; /* secret offset, derived from iss, 3 bits */ data |= mss << 4; /* mss, 3 bits */ MD5Init(&ctx); MD5Update(&ctx, ((u_int8_t *)secbits) + off, SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); MD5Update(&ctx, secbits, off); MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc)); MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs)); MD5Update(&ctx, &data, sizeof(data)); MD5Final((u_int8_t *)&md5_buffer, &ctx); data |= (md5_buffer[0] << 7); sc->sc_iss = data; #ifdef INET6 *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; #endif /* Additional parameters are stored in the timestamp if present. */ if (sc->sc_flags & SCF_TIMESTAMP) { data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */ data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */ data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */ data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */ data |= md5_buffer[2] << 10; /* more digest bits */ data ^= md5_buffer[3]; sc->sc_ts = data; sc->sc_tsoff = data - ticks; /* after XOR */ } V_tcpstat.tcps_sc_sendcookie++; } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcpopt *to, struct tcphdr *th, struct socket *so) { INIT_VNET_INET(curvnet); MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data = 0; u_int32_t *secbits; tcp_seq ack, seq; int off, mss, wnd, flags; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and * revert sequence number advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; off = (ack >> 1) & 0x7; mss = (ack >> 4) & 0x7; flags = ack & 0x7f; /* Which of the two secrets to use. */ secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even; /* * The secret wasn't updated for the lifetime of a syncookie, * so this SYN-ACK/ACK is either too old (replay) or totally bogus. */ if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) { return (NULL); } /* Recompute the digest so we can compare it. */ MD5Init(&ctx); MD5Update(&ctx, ((u_int8_t *)secbits) + off, SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); MD5Update(&ctx, secbits, off); MD5Update(&ctx, inc, sizeof(*inc)); MD5Update(&ctx, &seq, sizeof(seq)); MD5Update(&ctx, &flags, sizeof(flags)); MD5Final((u_int8_t *)&md5_buffer, &ctx); /* Does the digest part of our ACK'ed ISS match? */ if ((ack & (~0x7f)) != (md5_buffer[0] << 7)) return (NULL); /* Does the digest part of our reflected timestamp match? */ if (to->to_flags & TOF_TS) { data = md5_buffer[3] ^ to->to_tsecr; if ((data & (~0x3ff)) != (md5_buffer[2] << 10)) return (NULL); } /* Fill in the syncache values.
*/ bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; #ifdef INET6 if (inc->inc_isipv6) { if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; } else #endif { sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos; } /* Additional parameters that were encoded in the timestamp. */ if (data) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - ticks; sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0; sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0; sc->sc_requested_s_scale = min((data >> 2) & 0xf, TCP_MAX_WINSHIFT); sc->sc_requested_r_scale = min((data >> 6) & 0xf, TCP_MAX_WINSHIFT); if (sc->sc_requested_s_scale || sc->sc_requested_r_scale) sc->sc_flags |= SCF_WINSCALE; } else sc->sc_flags |= SCF_NOOPT; wnd = sbspace(&so->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; sc->sc_rxmits = 0; sc->sc_peer_mss = tcp_sc_msstab[mss]; V_tcpstat.tcps_sc_recvcookie++; return (sc); } /* * Returns the current number of syncache entries. This number * will probably change before you get around to calling * syncache_pcblist. */ int syncache_pcbcount(void) { INIT_VNET_INET(curvnet); struct syncache_head *sch; int count, i; for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { /* No need to lock for a read. */ sch = &V_tcp_syncache.hashbase[i]; count += sch->sch_length; } return count; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { INIT_VNET_INET(curvnet); struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_isipv6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.xt_tp.t_inpcb = &xt.xt_inp; xt.xt_tp.t_state = TCPS_SYN_RECEIVED; xt.xt_socket.xso_protocol = IPPROTO_TCP; xt.xt_socket.xso_len = sizeof (struct xsocket); xt.xt_socket.so_type = SOCK_STREAM; xt.xt_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 185087) +++ head/sys/netinet/tcp_timewait.c (revision 185088) @@ -1,664 +1,666 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #include #include static uma_zone_t tcptw_zone; static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * tcbinfo lock, which must be held over queue iteration and modification. */ +#ifdef VIMAGE_GLOBALS static TAILQ_HEAD(, tcptw) twq_2msl; +int nolocaltimewait; +#endif static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *); static int tcptw_auto_size(void) { INIT_VNET_INET(curvnet); int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ if (V_ipport_lastauto > V_ipport_firstauto) halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; else halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. 
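 *
 * (Worked example, editorial: with an ephemeral range of, say,
 * 49152-65535, halfrange = (65535 - 49152) / 2 = 8191; with
 * maxsockets at 25600 the cap computed below becomes
 * imin(imax(8191, 32), 5120) = 5120 compressed TIME_WAIT entries.)
 *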
*/ return (imin(imax(halfrange, 32), maxsockets / 5)); } static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = tcptw_auto_size(); else new = maxtcptw; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; uma_zone_set_max(tcptw_zone, maxtcptw); } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); -static int nolocaltimewait = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, &nolocaltimewait, 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); void tcp_tw_zone_change(void) { if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, tcptw_auto_size()); } void tcp_tw_init(void) { INIT_VNET_INET(curvnet); tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); } /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) INIT_VNET_INET(tp->t_vnet); #endif struct tcptw *tw; struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ INP_WLOCK_ASSERT(inp); if (V_nolocaltimewait && in_localip(inp->inp_faddr)) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } tw = uma_zalloc(tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } tw->tw_inpcb = inp; /* * Recover last window size sent. */ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); tw->tw_cred = crhold(so->so_cred); SOCK_LOCK(so); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_vflag |= INP_TIMEWAIT; tcp_tw_2msl_reset(tw, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. 
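 *
 * (Illustrative summary of the handoff performed below: clearing
 * INP_SOCKREF gives up the inpcb's ownership of the socket, clearing
 * SS_PROTOREF tells the socket layer that no protocol reference
 * remains, and sofree() then frees the socket unless some other
 * reference is still held; both flags are manipulated only under the
 * locks taken below.)
 *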
*/ if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_WUNLOCK(inp); }
#if 0 /* * The approximate rate of ISN increase of Microsoft TCP stacks; * the actual rate is slightly higher due to the addition of * random positive increments. * * Most other new OSes use semi-randomized ISN values, so we * do not need to worry about them. */ #define MS_ISN_BYTES_PER_SECOND 250000 /* * Determine if the ISN we will generate has advanced beyond the last * sequence number used by the previous connection. If so, indicate * that it is safe to recycle this tw socket by returning 1. */ int tcp_twrecycleable(struct tcptw *tw) { INIT_VNET_INET(curvnet); tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) return (1); else return (0); } #endif
/* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, struct mbuf *m, int tlen) { #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) INIT_VNET_INET(curvnet); #endif struct tcptw *tw; int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcbs whose timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment.
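 * (Editorial illustration, cf. RFC 793's reset rules: when the
 * incoming segment carried an ACK we answer
 *	<SEQ=SEG.ACK><CTL=RST>
 * and otherwise
 *	<SEQ=0><ACK=SEG.SEQ (+1 if SYN)><CTL=RST,ACK>,
 * which is exactly what the two tcp_respond() calls below encode.)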
* Don't bother to respond if destination was broadcast/multicast. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { #ifdef INET6 struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; #endif } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } INP_WUNLOCK(inp); return (0); drop: INP_WUNLOCK(inp); m_freem(m); return (0); } void tcp_twclose(struct tcptw *tw, int reuse) { INIT_VNET_INET(curvnet); struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ INP_WLOCK_ASSERT(inp); tw->tw_inpcb = NULL; tcp_tw_2msl_stop(tw); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_vflag & INP_SOCKREF) { inp->inp_vflag &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_WUNLOCK(inp); } } else { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) in6_pcbfree(inp); else #endif in_pcbfree(inp); } V_tcpstat.tcps_closed++; crfree(tw->tw_cred); tw->tw_cred = NULL; if (reuse) return; uma_zfree(tcptw_zone, tw); } int tcp_twrespond(struct tcptw *tw, int flags) { INIT_VNET_INET(curvnet); struct inpcb *inp = tw->tw_inpcb; struct tcphdr *th; struct mbuf *m; struct ip *ip = NULL; u_int hdrlen, optlen; int error; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_isipv6; #endif INP_WLOCK_ASSERT(inp); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } to.to_flags = 0; /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. 
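 *
 * (For illustration: when timestamps are in play the header grows by
 * TCPOLEN_TSTAMP_APPA (12) bytes of options,
 *	to.to_tsval = ticks + tw->ts_offset;
 *	to.to_tsecr = tw->t_recent;
 * which tcp_addoptions() lays out as NOP,NOP,TIMESTAMP.)
 *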
*/ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; to.to_tsval = ticks + tw->ts_offset; to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); #ifdef INET6 if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; if (V_path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } if (flags & TH_ACK) V_tcpstat.tcps_sndacks++; else V_tcpstat.tcps_sndctrl++; V_tcpstat.tcps_sndtotal++; return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { INIT_VNET_INET(curvnet); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); } static void tcp_tw_2msl_stop(struct tcptw *tw) { INIT_VNET_INET(curvnet); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); } struct tcptw * tcp_tw_2msl_scan(int reuse) { INIT_VNET_INET(curvnet); struct tcptw *tw; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); for (;;) { tw = TAILQ_FIRST(&V_twq_2msl); if (tw == NULL || (!reuse && tw->tw_time > ticks)) break; INP_WLOCK(tw->tw_inpcb); tcp_twclose(tw, reuse); if (reuse) return (tw); } return (NULL); } Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h (revision 185087) +++ head/sys/netinet/tcp_var.h (revision 185088) @@ -1,604 +1,623 @@ /*- * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include /* * Kernel variables for tcp. */ extern int tcp_do_rfc1323; /* TCP segment queue entry */ struct tseg_qent { LIST_ENTRY(tseg_qent) tqe_q; int tqe_len; /* TCP segment data length */ struct tcphdr *tqe_th; /* a pointer to tcp header */ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); extern int tcp_reass_qsize; extern struct uma_zone *tcp_reass_zone; struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; }; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. 
*/ struct tcpcb { struct tsegqe_head t_segq; /* segment reassembly queue */ int t_segqlen; /* segment reassembly queue length */ int t_dupacks; /* consecutive dup acks recd */ struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ u_int t_flags; #define TF_ACKNOW 0x000001 /* ack peer immediately */ #define TF_DELACK 0x000002 /* ack, but try to delay it */ #define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x000008 /* don't use tcp options */ #define TF_SENTFIN 0x000010 /* have sent FIN */ #define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ #define TF_TSO 0x1000000 /* TSO enabled on this connection */ #define TF_TOE 0x2000000 /* this connection is offloaded */ #define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */ #define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */ #define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ u_long rcv_wnd; /* receive window */ tcp_seq rcv_up; /* receive urgent pointer */ u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ u_long snd_bwnd; /* bandwidth-controlled window */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ u_long snd_bandwidth; /* calculated bandwidth or 0 */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ u_long t_rcvtime; /* inactivity time */ u_long t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ int t_bw_rtttime; /* used for bandwidth calculation */ tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ u_long t_rttupdated; /* number of times rtt sampled */ u_long max_sndwnd; /* largest window peer has offered */ int t_softerror; /* possible error not yet reported */ /* out-of-band data */ char t_oobflags; /* have some */ char t_iobc; /* input character */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* RFC 1323 variables */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ u_int32_t ts_recent; /* timestamp echo data */ u_long ts_recent_age; /* when last updated */ u_int32_t ts_offset; /* our timestamp offset */ tcp_seq last_ack_sent; /* experimental */ u_long snd_cwnd_prev; /* cwnd prior to retransmit */ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ u_char snd_limited; /* segments limited transmitted */ /* SACK related state */ int snd_numholes; /* number of holes seen by sender */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r */ int rcv_numsacks; /* # distinct sack blks present */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observed RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ void *t_pspare[3]; /* toe usrreqs / toepcb * / congestion algo / vimage / 1 general use */ struct toe_usrreqs *t_tu; /* offload operations vector */ void *t_toe; /* TOE pcb pointer */ };
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY
#ifdef TCP_SIGNATURE /* * Defines which are needed by the xform_tcp module and tcp_[in|out]put * for SADB verification and lookup. */ #define TCP_SIGLEN 16 /* length of computed digest in bytes */ #define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */ #define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */ /* * Only a single SA per host may be specified at this time. An SPI is * needed in order for the KEY_ALLOCSA() lookup to work. */ #define TCP_SIG_SPI 0x1000 #endif /* TCP_SIGNATURE */
/* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions.
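 *
 * (Illustration: tcp_addoptions() walks these flag bits from low to
 * high, so a SYN with to_flags = TOF_MSS|TOF_SCALE|TOF_SACKPERM has
 * its options emitted in MSS, window scale, SACK-permitted order.)
 *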
*/ struct tcpopt { u_long to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_MAXOPT 0x0100 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_bandwidth; /* estimated bandwidth */ u_long rmx_cwnd; /* congestion window */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; tcp_seq irs; u_short last_win; /* cached window value */ u_short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_long t_recent; u_int32_t ts_offset; /* our timestamp offset */ u_long t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. 
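 *
 * Worked example (editorial): a smoothed RTT of 8 ticks is stored as
 * t_srtt = 8 << TCP_RTT_SHIFT = 256, and a variance of 2 ticks as
 * t_rttvar = 2 << TCP_RTTVAR_SHIFT = 32; the macro below then yields
 *
 *	((256 >> (5 - 2)) + 32) >> 2 = (32 + 32) >> 2 = 16 ticks,
 *
 * i.e. exactly rtt + 4 * rttvar = 8 + 4 * 2, subject to the
 * t_rttmin floor.
 *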
*/ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
/* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { u_long tcps_connattempt; /* connections initiated */ u_long tcps_accepts; /* connections accepted */ u_long tcps_connects; /* connections established */ u_long tcps_drops; /* connections dropped */ u_long tcps_conndrops; /* embryonic connections dropped */ u_long tcps_minmssdrops; /* average minmss too low drops */ u_long tcps_closed; /* conn. closed (includes drops) */ u_long tcps_segstimed; /* segs where we tried to get rtt */ u_long tcps_rttupdated; /* times we succeeded */ u_long tcps_delack; /* delayed acks sent */ u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ u_long tcps_rexmttimeo; /* retransmit timeouts */ u_long tcps_persisttimeo; /* persist timeouts */ u_long tcps_keeptimeo; /* keepalive timeouts */ u_long tcps_keepprobe; /* keepalive probes sent */ u_long tcps_keepdrops; /* connections dropped in keepalive */ u_long tcps_sndtotal; /* total packets sent */ u_long tcps_sndpack; /* data packets sent */ u_long tcps_sndbyte; /* data bytes sent */ u_long tcps_sndrexmitpack; /* data packets retransmitted */ u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */ u_long tcps_sndacks; /* ack-only packets sent */ u_long tcps_sndprobe; /* window probes sent */ u_long tcps_sndurg; /* packets sent with URG only */ u_long tcps_sndwinup; /* window update-only packets sent */ u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ u_long tcps_rcvtotal; /* total packets received */ u_long tcps_rcvpack; /* packets received in sequence */ u_long tcps_rcvbyte; /* bytes received in sequence */ u_long tcps_rcvbadsum; /* packets received with cksum errs */ u_long tcps_rcvbadoff; /* packets received with bad offset */ u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ u_long tcps_rcvshort; /* packets received too short */ u_long tcps_rcvduppack; /* duplicate-only packets received */ u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ u_long tcps_rcvpartduppack; /* packets with some duplicate data */ u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup.
packets */ u_long tcps_rcvoopack; /* out-of-order packets received */ u_long tcps_rcvoobyte; /* out-of-order bytes received */ u_long tcps_rcvpackafterwin; /* packets with data after window */ u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ u_long tcps_rcvafterclose; /* packets rcvd after "close" */ u_long tcps_rcvwinprobe; /* rcvd window probe packets */ u_long tcps_rcvdupack; /* rcvd duplicate acks */ u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ u_long tcps_rcvackpack; /* rcvd ack packets */ u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ u_long tcps_rcvwinupd; /* rcvd window update packets */ u_long tcps_pawsdrop; /* segments dropped due to PAWS */ u_long tcps_predack; /* times hdr predict ok for acks */ u_long tcps_preddat; /* times hdr predict ok for data pkts */ u_long tcps_pcbcachemiss; u_long tcps_cachedrtt; /* times cached RTT in route updated */ u_long tcps_cachedrttvar; /* times cached rttvar updated */ u_long tcps_cachedssthresh; /* times cached ssthresh updated */ u_long tcps_usedrtt; /* times RTT initialized from route */ u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ u_long tcps_persistdrop; /* timeout in persist state */ u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */ u_long tcps_mturesent; /* resends due to MTU discovery */ u_long tcps_listendrop; /* listen queue overflows */ u_long tcps_badrst; /* ignored RSTs in the window */ u_long tcps_sc_added; /* entry added to syncache */ u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */ u_long tcps_sc_dupsyn; /* duplicate SYN packet */ u_long tcps_sc_dropped; /* could not reply to packet */ u_long tcps_sc_completed; /* successful extraction of entry */ u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */ u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */ u_long tcps_sc_reset; /* RST removed entry from syncache */ u_long tcps_sc_stale; /* timed out or listen socket gone */ u_long tcps_sc_aborted; /* syncache entry aborted */ u_long tcps_sc_badack; /* removed due to bad ACK */ u_long tcps_sc_unreach; /* ICMP unreachable received */ u_long tcps_sc_zonefail; /* zalloc() failed */ u_long tcps_sc_sendcookie; /* SYN cookie sent */ u_long tcps_sc_recvcookie; /* SYN cookie received */ u_long tcps_hc_added; /* entry added to hostcache */ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ u_long tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ u_long tcps_sack_rexmits; /* SACK rexmit segments */ u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ u_long tcps_ecn_ce; /* ECN Congestion Experienced */ u_long tcps_ecn_ect0; /* ECN Capable Transport */ u_long tcps_ecn_ect1; /* ECN Capable Transport */ u_long tcps_ecn_shs; /* ECN successful handshakes */ u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ }; /* * TCB structure exported to user-land via sysctl(3). * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. 
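 *
 * (Illustrative consumer usage: a userland client such as netstat
 * must pull in the prerequisites itself, e.g.
 *
 *	#include <sys/socketvar.h>
 *	#include <netinet/in_pcb.h>
 *	#include <netinet/tcp_var.h>
 *
 * or struct xtcpcb below is silently left undeclared.)
 *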
*/ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { size_t xt_len; struct inpcb xt_inp; struct tcpcb xt_tp; struct xsocket xt_socket; u_quad_t xt_alignment_hack; }; #endif /* * Names for TCP sysctl objects */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics (read-only) */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_MAXID 16 #define TCPCTL_FINWAIT2_TIMEOUT 17 #define TCPCTL_NAMES { \ { 0, 0 }, \ { "rfc1323", CTLTYPE_INT }, \ { "mssdflt", CTLTYPE_INT }, \ { "stats", CTLTYPE_STRUCT }, \ { "rttdflt", CTLTYPE_INT }, \ { "keepidle", CTLTYPE_INT }, \ { "keepintvl", CTLTYPE_INT }, \ { "sendspace", CTLTYPE_INT }, \ { "recvspace", CTLTYPE_INT }, \ { "keepinit", CTLTYPE_INT }, \ { "pcblist", CTLTYPE_STRUCT }, \ { "delacktime", CTLTYPE_INT }, \ { "v6mssdflt", CTLTYPE_INT }, \ { "maxid", CTLTYPE_INT }, \ } #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_log_in_vain; extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_delack_enabled; extern int tcp_do_newreno; extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local; +extern int blackhole; +extern int drop_synfin; +extern int tcp_do_rfc3042; +extern int tcp_do_rfc3390; +extern int tcp_insecure_rst; +extern int tcp_do_autorcvbuf; +extern int tcp_autorcvbuf_inc; +extern int tcp_autorcvbuf_max; + +extern int tcp_do_tso; +extern int tcp_do_autosndbuf; +extern int tcp_autosndbuf_inc; +extern int tcp_autosndbuf_max; + +extern int nolocaltimewait; + extern int tcp_do_sack; /* SACK enabled/disabled */ +extern int tcp_sack_maxholes; +extern int tcp_sack_globalmaxholes; +extern int tcp_sack_globalholes; extern int tcp_sc_rst_sock_fail; /* RST on sock alloc failure */ extern int tcp_do_ecn; /* TCP ECN enabled/disabled */ extern int tcp_ecn_maxretries; int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); #if 0 int tcp_twrecycleable(struct tcptw *tw); #endif void tcp_twclose(struct tcptw *_tw, int _reuse); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_fasttimo(void); void tcp_init(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); void tcp_reass_init(void); void tcp_input(struct mbuf *, int); u_long tcp_maxmtu(struct in_conninfo *, int *); u_long tcp_maxmtu6(struct in_conninfo *, int *); void tcp_mss_update(struct 
tcpcb *, int, struct hc_metrics_lite *, int *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct inpcb * tcp_mtudisc(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); int tcp_twrespond(struct tcptw *, int); void tcp_setpersist(struct tcpcb *); #ifdef TCP_SIGNATURE int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int); #endif void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, int, u_int); int tcp_timer_active(struct tcpcb *, int); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); u_long tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, u_long); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; extern u_long tcp_sendspace; extern u_long tcp_recvspace; tcp_seq tcp_new_isn(struct tcpcb *); void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long ); #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 185087) +++ head/sys/netinet/udp_usrreq.c (revision 185088) @@ -1,1282 +1,1289 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #ifdef IPSEC #include #endif #include #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ +#ifdef VIMAGE_GLOBALS +int udp_blackhole; +#endif + /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ static int udp_cksum = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum, 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); -int udp_blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &udp_blackhole, 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); +#ifdef VIMAGE_GLOBALS struct inpcbhead udb; /* from udp_var.h */ struct inpcbinfo udbinfo; +struct udpstat udpstat; /* from udp_var.h */ +#endif #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif -struct udpstat udpstat; /* from udp_var.h */ SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } void udp_init(void) { INIT_VNET_INET(curvnet); + + V_udp_blackhole = 0; INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); LIST_INIT(&V_udb); V_udbinfo.ipi_listhead = &V_udb; V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, &V_udbinfo.ipi_hashmask); V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, &V_udbinfo.ipi_porthashmask); V_udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, 
NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); }
/* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. */ static void udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif INP_RLOCK_ASSERT(inp); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { INIT_VNET_IPSEC(curvnet); m_freem(n); V_ipsec4stat.in_polvio++; return; } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { INIT_VNET_INET(so->so_vnet); SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); V_udpstat.udps_fullsock++; } else sorwakeup_locked(so); }
void udp_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); int iphlen = off; struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; int len; struct ip save_ip; struct sockaddr_in udp_in; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif ifp = m->m_pkthdr.rcvif; V_udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { V_udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { V_udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want to restore it for * sending an ICMP error message in response.
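 *
 * (If blackhole mode is enabled we will never send that ICMP, so
 * the copy is skipped. Illustrative administrative usage:
 *	sysctl net.inet.udp.blackhole=1
 * flips the same variable this revision moves under VIMAGE_GLOBALS.)
 *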
*/ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { V_udpstat.udps_badsum++; m_freem(m); return; } } else V_udpstat.udps_nosum++; #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL) { struct sockaddr_in *next_hop; /* * Do the hack. */ next_hop = (struct sockaddr_in *)(fwd_tag + 1); ip->ip_dst = next_hop->sin_addr; uh->uh_dport = ntohs(next_hop->sin_port); /* * Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } #endif INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; last = NULL; LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; /* * XXX: Do not check source port of incoming datagram * unless inp_connect() has been called to bind the * fport part of the 4-tuple; the source could be * trying to talk to us with an ephemeral port. */ if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL) { struct sockaddr_in sin; struct in_msource *ims; int blocked, mode; size_t idx; bzero(&sin, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = ip->ip_dst; blocked = 0; idx = imo_match_group(imo, ifp, (struct sockaddr *)&sin); if (idx == -1) { /* * No group membership for this socket. * Do not bump udps_noportbcast, as * this will happen further down. */ blocked++; } else { /* * Check for a multicast source filter * entry on this socket for this group. * MCAST_EXCLUDE is the default * behaviour. It means default accept; * entries, if present, denote sources * to be excluded from delivery. */ ims = imo_match_source(imo, idx, (struct sockaddr *)&udp_in); mode = imo->imo_mfilters[idx].imf_fmode; if ((ims != NULL && mode == MCAST_EXCLUDE) || (ims == NULL && mode == MCAST_INCLUDE)) { #ifdef DIAGNOSTIC if (bootverbose) { printf("%s: blocked by" " source filter\n", __func__); } #endif V_udpstat.udps_filtermcast++; blocked++; } } if (blocked != 0) { INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr), &udp_in); INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. 
This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datagram.) */ V_udpstat.udps_noportbcast++; goto badheadlocked; } udp_append(last, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_RUNLOCK(last); INP_INFO_RUNLOCK(&V_udbinfo); return; }
/* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, ifp); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } V_udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { V_udpstat.udps_noportbcast++; goto badheadlocked; } if (V_udp_blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Check the minimum TTL for the socket. */ INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); goto badunlocked; } udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_RUNLOCK(inp); return; badheadlocked: if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); badunlocked: m_freem(m); }
/* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); }
void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { INIT_VNET_INET(curvnet); struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections.
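 *
 * (For illustration: an ICMP port-unreachable from the peer arrives
 * as PRC_UNREACH_PORT, maps through inetctlerrmap[] to ECONNREFUSED,
 * and udp_notify() above stores that in so_error, so a blocked
 * recv(2) on the connected socket wakes up and fails with it.)
 *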
*/ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&V_udbinfo); inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_RLOCK(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); } else in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) inp_list[i++] = inp; INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. 
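 *
 * (Sketch of the consumer side, hypothetical code: fetch the list
 * with
 *	sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0);
 * then compare the leading struct xinpgen's xig_gen against the
 * trailing one, and re-issue the request when they differ.)
 *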
*/ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&V_udbinfo); inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp != NULL) { INP_RLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else { INP_INFO_RUNLOCK(&V_udbinfo); error = ENOENT; } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { INIT_VNET_INET(inp->inp_vnet); struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct sockaddr_in *sin, src; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. 
This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. */ sin = (struct sockaddr_in *)addr; INP_RLOCK(inp); if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); unlock_udbinfo = 2; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) { INP_RUNLOCK(inp); INP_INFO_RLOCK(&V_udbinfo); INP_RLOCK(inp); } unlock_udbinfo = 1; } else unlock_udbinfo = 0; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_INFO_LOCK_ASSERT(&V_udbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_INFO_LOCK_ASSERT(&V_udbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_INFO_WLOCK_ASSERT(&V_udbinfo); INP_WLOCK_ASSERT(inp); /* * Remember addr if jailed, to prevent * rebinding. 
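 *
 * Editor's note: for context, the IP_SENDSRCADDR path parsed at the top
 * of this function is driven from userland roughly as follows (a hedged
 * sketch; s, dst, payload, payloadlen and srcaddr are placeholders, and
 * the socket must already be bound to a local port for the checks above
 * to pass):
 *
 *	char cbuf[CMSG_SPACE(sizeof(struct in_addr))];
 *	struct iovec iov = { payload, payloadlen };
 *	struct msghdr msg;
 *	struct cmsghdr *cmsg;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	memset(cbuf, 0, sizeof(cbuf));
 *	msg.msg_name = &dst;		(a struct sockaddr_in)
 *	msg.msg_namelen = sizeof(dst);
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = IPPROTO_IP;
 *	cmsg->cmsg_type = IP_SENDSRCADDR;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
 *	memcpy(CMSG_DATA(cmsg), &srcaddr, sizeof(struct in_addr));
 *	if (sendmsg(s, &msg, 0) < 0)
 *		err(1, "sendmsg");
 *
 * cmsg_len must be exactly CMSG_LEN(sizeof(struct in_addr)); the
 * control-message loop above rejects anything else with EINVAL.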
*/ if (jailed(td->td_ucred)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate the data length and get a mbuf for the UDP, IP, and * possible link-layer headers. Immediately slide the data pointer * forward again, since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in the mbuf with an extended UDP header, with the addresses * and length put into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = IPPROTO_UDP; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= IP_DF; } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up the checksum and output the datagram. */ if (udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else ui->ui_sum = 0; ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ V_udpstat.udps_opackets++; if (unlock_udbinfo == 2) INP_INFO_WUNLOCK(&V_udbinfo); else if (unlock_udbinfo == 1) INP_INFO_RUNLOCK(&V_udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); if (unlock_udbinfo == 2) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == 2) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); } else if (unlock_udbinfo == 1) { INP_RUNLOCK(inp); INP_INFO_RUNLOCK(&V_udbinfo); } else INP_RUNLOCK(inp); m_freem(m); return (error); } static void udp_abort(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; soisdisconnected(so); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_udbinfo); error = in_pcballoc(so, &V_udbinfo); if (error) { INP_INFO_WUNLOCK(&V_udbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl;
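/*
 * Editor's note: in_pcballoc() is expected to return with the new inpcb
 * write-locked, which is why the INP_WUNLOCK() below has no visible
 * INP_WLOCK() counterpart in this function.
 */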
INP_WUNLOCK(inp); return (0); } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp_close(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; soisdisconnected(so); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; struct sockaddr_in *sin; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (EISCONN); } sin = (struct sockaddr_in *)nam; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td->td_ucred); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_disconnect(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (ENOTCONN); } in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_udbinfo); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; Index: head/sys/netinet/vinet.h =================================================================== --- head/sys/netinet/vinet.h (revision 185087) +++ head/sys/netinet/vinet.h (revision 185088) @@ -1,331 +1,333 @@ /*- * Copyright (c) 2006-2008 University of Zagreb * Copyright (c) 2006-2008 FreeBSD 
Foundation * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET_VINET_H_ #define _NETINET_VINET_H_ #ifdef VIMAGE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vnet_inet { struct in_ifaddrhashhead *_in_ifaddrhashtbl; struct in_ifaddrhead _in_ifaddrhead; u_long _in_ifaddrhmask; struct in_multihead _in_multihead; int _arpt_keep; int _arp_maxtries; int _useloopback; int _arp_proxyall; int _subnetsarelocal; int _sameprefixcarponly; int _ipforwarding; int _ipstealth; int _ipfastforward_active; int _ipsendredirects; int _ip_defttl; int _ip_keepfaith; int _ip_sendsourcequench; int _ip_do_randomid; int _ip_checkinterface; u_short _ip_id; uma_zone_t _ipq_zone; int _nipq; /* Total # of reass queues */ int _maxnipq; /* Admin. limit on # reass queues. 
*/ int _maxfragsperpacket; TAILQ_HEAD(ipqhead, ipq) _ipq[IPREASS_NHASH]; struct inpcbhead _tcb; /* head of queue of active tcpcb's */ struct inpcbinfo _tcbinfo; struct tcpstat _tcpstat; /* tcp statistics */ struct tcp_hostcache _tcp_hostcache; struct callout _tcp_hc_callout; struct tcp_syncache _tcp_syncache; struct inpcbhead _divcb; struct inpcbinfo _divcbinfo; TAILQ_HEAD(, tcptw) _twq_2msl; int _tcp_sc_rst_sock_fail; int _tcp_mssdflt; int _tcp_v6mssdflt; int _tcp_minmss; int _tcp_do_rfc1323; int _icmp_may_rst; int _tcp_isn_reseed_interval; int _tcp_inflight_enable; int _tcp_inflight_rttthresh; int _tcp_inflight_min; int _tcp_inflight_max; int _tcp_inflight_stab; int _nolocaltimewait; int _path_mtu_discovery; int _ss_fltsz; int _ss_fltsz_local; int _tcp_do_newreno; int _tcp_do_tso; int _tcp_do_autosndbuf; int _tcp_autosndbuf_inc; int _tcp_autosndbuf_max; int _tcp_do_sack; int _tcp_sack_maxholes; int _tcp_sack_globalmaxholes; int _tcp_sack_globalholes; int _blackhole; int _tcp_delack_enabled; int _drop_synfin; int _tcp_do_rfc3042; int _tcp_do_rfc3390; int _tcp_do_ecn; int _tcp_ecn_maxretries; int _tcp_insecure_rst; int _tcp_do_autorcvbuf; int _tcp_autorcvbuf_inc; int _tcp_autorcvbuf_max; int _tcp_reass_maxseg; int _tcp_reass_qsize; int _tcp_reass_maxqlen; int _tcp_reass_overflows; u_char _isn_secret[32]; int _isn_last_reseed; u_int32_t _isn_offset; u_int32_t _isn_offset_old; MD5_CTX _isn_ctx; struct inpcbhead _udb; struct inpcbinfo _udbinfo; struct udpstat _udpstat; int _udp_blackhole; struct inpcbhead _ripcb; struct inpcbinfo _ripcbinfo; struct socket *_ip_mrouter; struct socket *_ip_rsvpd; int _ip_rsvp_on; int _rsvp_on; struct icmpstat _icmpstat; struct ipstat _ipstat; struct igmpstat _igmpstat; SLIST_HEAD(, router_info) _router_info_head; int _rtq_timeout; int _rtq_reallyold; int _rtq_minreallyold; int _rtq_toomany; struct callout _rtq_timer; int _ipport_lowfirstauto; int _ipport_lowlastauto; int _ipport_firstauto; int _ipport_lastauto; int _ipport_hifirstauto; int _ipport_hilastauto; int _ipport_reservedhigh; int _ipport_reservedlow; int _ipport_randomized; int _ipport_randomcps; int _ipport_randomtime; int _ipport_stoprandom; int _ipport_tcpallocs; int _ipport_tcplastcount; int _icmpmaskrepl; u_int _icmpmaskfake; int _drop_redirect; int _log_redirect; int _icmplim; int _icmplim_output; char _reply_src[IFNAMSIZ]; int _icmp_rfi; int _icmp_quotelen; int _icmpbmcastecho; }; #endif /* * Symbol translation macros */ #define INIT_VNET_INET(vnet) \ INIT_FROM_VNET(vnet, VNET_MOD_INET, struct vnet_inet, vnet_inet) #define VNET_INET(sym) VSYM(vnet_inet, sym) #define V_arp_maxtries VNET_INET(arp_maxtries) #define V_arp_proxyall VNET_INET(arp_proxyall) #define V_arpt_keep VNET_INET(arpt_keep) #define V_arpt_prune VNET_INET(arpt_prune) #define V_blackhole VNET_INET(blackhole) #define V_divcb VNET_INET(divcb) #define V_divcbinfo VNET_INET(divcbinfo) #define V_drop_redirect VNET_INET(drop_redirect) #define V_drop_synfin VNET_INET(drop_synfin) #define V_icmp_may_rst VNET_INET(icmp_may_rst) #define V_icmp_quotelen VNET_INET(icmp_quotelen) #define V_icmp_rfi VNET_INET(icmp_rfi) #define V_icmpbmcastecho VNET_INET(icmpbmcastecho) #define V_icmplim VNET_INET(icmplim) #define V_icmplim_output VNET_INET(icmplim_output) #define V_icmpmaskfake VNET_INET(icmpmaskfake) #define V_icmpmaskrepl VNET_INET(icmpmaskrepl) #define V_icmpstat VNET_INET(icmpstat) #define V_igmpstat VNET_INET(igmpstat) #define V_in_ifaddrhashtbl VNET_INET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET_INET(in_ifaddrhead) #define 
V_in_ifaddrhmask VNET_INET(in_ifaddrhmask) #define V_in_multihead VNET_INET(in_multihead) #define V_ip_checkinterface VNET_INET(ip_checkinterface) #define V_ip_defttl VNET_INET(ip_defttl) #define V_ip_do_randomid VNET_INET(ip_do_randomid) #define V_ip_id VNET_INET(ip_id) #define V_ip_keepfaith VNET_INET(ip_keepfaith) #define V_ip_mrouter VNET_INET(ip_mrouter) #define V_ip_rsvp_on VNET_INET(ip_rsvp_on) #define V_ip_rsvpd VNET_INET(ip_rsvpd) #define V_ip_sendsourcequench VNET_INET(ip_sendsourcequench) #define V_ipfastforward_active VNET_INET(ipfastforward_active) #define V_ipforwarding VNET_INET(ipforwarding) #define V_ipport_firstauto VNET_INET(ipport_firstauto) #define V_ipport_hifirstauto VNET_INET(ipport_hifirstauto) #define V_ipport_hilastauto VNET_INET(ipport_hilastauto) #define V_ipport_lastauto VNET_INET(ipport_lastauto) #define V_ipport_lowfirstauto VNET_INET(ipport_lowfirstauto) #define V_ipport_lowlastauto VNET_INET(ipport_lowlastauto) #define V_ipport_randomcps VNET_INET(ipport_randomcps) #define V_ipport_randomized VNET_INET(ipport_randomized) #define V_ipport_randomtime VNET_INET(ipport_randomtime) #define V_ipport_reservedhigh VNET_INET(ipport_reservedhigh) #define V_ipport_reservedlow VNET_INET(ipport_reservedlow) #define V_ipport_stoprandom VNET_INET(ipport_stoprandom) #define V_ipport_tcpallocs VNET_INET(ipport_tcpallocs) #define V_ipport_tcplastcount VNET_INET(ipport_tcplastcount) #define V_ipq VNET_INET(ipq) #define V_ipq_zone VNET_INET(ipq_zone) #define V_ipsendredirects VNET_INET(ipsendredirects) #define V_ipstat VNET_INET(ipstat) #define V_ipstealth VNET_INET(ipstealth) #define V_isn_ctx VNET_INET(isn_ctx) #define V_isn_last_reseed VNET_INET(isn_last_reseed) #define V_isn_offset VNET_INET(isn_offset) #define V_isn_offset_old VNET_INET(isn_offset_old) #define V_isn_secret VNET_INET(isn_secret) #define V_llinfo_arp VNET_INET(llinfo_arp) #define V_log_redirect VNET_INET(log_redirect) #define V_maxfragsperpacket VNET_INET(maxfragsperpacket) #define V_maxnipq VNET_INET(maxnipq) #define V_nipq VNET_INET(nipq) #define V_nolocaltimewait VNET_INET(nolocaltimewait) #define V_path_mtu_discovery VNET_INET(path_mtu_discovery) #define V_reply_src VNET_INET(reply_src) #define V_ripcb VNET_INET(ripcb) #define V_ripcbinfo VNET_INET(ripcbinfo) #define V_router_info_head VNET_INET(router_info_head) #define V_rsvp_on VNET_INET(rsvp_on) #define V_rtq_minreallyold VNET_INET(rtq_minreallyold) #define V_rtq_reallyold VNET_INET(rtq_reallyold) #define V_rtq_timeout VNET_INET(rtq_timeout) #define V_rtq_timer VNET_INET(rtq_timer) #define V_rtq_toomany VNET_INET(rtq_toomany) #define V_sameprefixcarponly VNET_INET(sameprefixcarponly) #define V_ss_fltsz VNET_INET(ss_fltsz) #define V_ss_fltsz_local VNET_INET(ss_fltsz_local) #define V_subnetsarelocal VNET_INET(subnetsarelocal) #define V_tcb VNET_INET(tcb) #define V_tcbinfo VNET_INET(tcbinfo) #define V_tcp_autorcvbuf_inc VNET_INET(tcp_autorcvbuf_inc) #define V_tcp_autorcvbuf_max VNET_INET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET_INET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET_INET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET_INET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET_INET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET_INET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET_INET(tcp_do_ecn) #define V_tcp_do_newreno VNET_INET(tcp_do_newreno) #define V_tcp_do_rfc1323 VNET_INET(tcp_do_rfc1323) #define V_tcp_do_rfc3042 VNET_INET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET_INET(tcp_do_rfc3390) #define 
V_tcp_do_sack VNET_INET(tcp_do_sack) #define V_tcp_do_tso VNET_INET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET_INET(tcp_ecn_maxretries) #define V_tcp_hc_callout VNET_INET(tcp_hc_callout) #define V_tcp_hostcache VNET_INET(tcp_hostcache) #define V_tcp_inflight_enable VNET_INET(tcp_inflight_enable) #define V_tcp_inflight_max VNET_INET(tcp_inflight_max) #define V_tcp_inflight_min VNET_INET(tcp_inflight_min) #define V_tcp_inflight_rttthresh VNET_INET(tcp_inflight_rttthresh) #define V_tcp_inflight_stab VNET_INET(tcp_inflight_stab) #define V_tcp_insecure_rst VNET_INET(tcp_insecure_rst) #define V_tcp_isn_reseed_interval VNET_INET(tcp_isn_reseed_interval) #define V_tcp_minmss VNET_INET(tcp_minmss) #define V_tcp_mssdflt VNET_INET(tcp_mssdflt) #define V_tcp_reass_maxqlen VNET_INET(tcp_reass_maxqlen) #define V_tcp_reass_maxseg VNET_INET(tcp_reass_maxseg) #define V_tcp_reass_overflows VNET_INET(tcp_reass_overflows) #define V_tcp_reass_qsize VNET_INET(tcp_reass_qsize) #define V_tcp_sack_globalholes VNET_INET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET_INET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET_INET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET_INET(tcp_sc_rst_sock_fail) #define V_tcp_syncache VNET_INET(tcp_syncache) +#define V_tcp_syncookies VNET_INET(tcp_syncookies) +#define V_tcp_syncookiesonly VNET_INET(tcp_syncookiesonly) #define V_tcp_v6mssdflt VNET_INET(tcp_v6mssdflt) #define V_tcpstat VNET_INET(tcpstat) #define V_twq_2msl VNET_INET(twq_2msl) #define V_udb VNET_INET(udb) #define V_udbinfo VNET_INET(udbinfo) #define V_udp_blackhole VNET_INET(udp_blackhole) #define V_udpstat VNET_INET(udpstat) #define V_useloopback VNET_INET(useloopback) #endif /* !_NETINET_VINET_H_ */ Index: head/sys/netinet6/frag6.c =================================================================== --- head/sys/netinet6/frag6.c (revision 185087) +++ head/sys/netinet6/frag6.c (revision 185088) @@ -1,792 +1,794 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ECN definitions */ #include /* for ECN definitions */ #include /* * Define this to get correct behavior for per-interface statistics. * You will need to perform an extra routing table lookup, per fragment, * to do it. This may, or may not be, a performance hit. */ #define IN6_IFSTAT_STRICT static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *); static void frag6_deq(struct ip6asfrag *); static void frag6_insque(struct ip6q *, struct ip6q *); static void frag6_remque(struct ip6q *); static void frag6_freef(struct ip6q *); static struct mtx ip6qlock; /* * These fields are all protected by ip6qlock. */ +#ifdef VIMAGE_GLOBALS static u_int frag6_nfragpackets; static u_int frag6_nfrags; static struct ip6q ip6q; /* ip6 reassemble queue */ +#endif #define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF); #define IP6Q_LOCK() mtx_lock(&ip6qlock) #define IP6Q_TRYLOCK() mtx_trylock(&ip6qlock) #define IP6Q_LOCK_ASSERT() mtx_assert(&ip6qlock, MA_OWNED) #define IP6Q_UNLOCK() mtx_unlock(&ip6qlock) static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); /* * Initialise the reassembly queue and fragment identifier. */ static void frag6_change(void *tag) { INIT_VNET_INET6(curvnet); V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; } void frag6_init(void) { INIT_VNET_INET6(curvnet); V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); IP6Q_LOCK_INIT(); V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; } /* * In RFC 2460, the fragmentation and reassembly rules do not agree with each * other in terms of next header field handling in the fragment header. * While the sender will use the same value for all of the fragmented packets, * the receiver is advised not to check for consistency. * * fragment rule (p20): * (2) A Fragment header containing: * The Next Header value that identifies the first header of * the Fragmentable Part of the original packet. * -> the next header field is the same for all fragments * * reassembly rule (p21): * The Next Header field of the last header of the Unfragmentable * Part is obtained from the Next Header field of the first * fragment's Fragment header. * -> should grab it from the first fragment only * * The following note also contradicts the fragment rule - no one is going to * send fragments of the same packet with differing next header fields. * * additional note (p22): * The Next Header values in the Fragment headers of different * fragments of the same original packet may differ. Only the value * from the Offset zero fragment packet is used for reassembly. * -> should grab it from the first fragment only * * No explicit reason is given in the RFC. Perhaps a historical one?
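 *
 * Editor's note: for illustration, a sender fills the Fragment header
 * roughly as follows (a sketch loosely modeled on the IPv6 output path;
 * nextproto, off, more_fragments and id are placeholders):
 *
 *	struct ip6_frag *ip6f;		(placed after the unfragmentable part)
 *
 *	ip6f->ip6f_nxt = nextproto;	(first header of the fragmentable part)
 *	ip6f->ip6f_reserved = 0;
 *	ip6f->ip6f_offlg = htons((u_int16_t)(off & ~7));
 *	if (more_fragments)
 *		ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 *	ip6f->ip6f_ident = id;
 *
 * A conforming sender gives every fragment of one packet the same
 * ip6f_nxt and ip6f_ident; only ip6f_offlg differs, which is why the
 * reassembly code below trusts the value from the offset-zero fragment.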
*/ /* * Fragment input */ int frag6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; struct ip6q *q6; struct ip6asfrag *af6, *ip6af, *af6dwn; #ifdef IN6_IFSTAT_STRICT struct in6_ifaddr *ia; #endif int offset = *offp, nxt, i, next; int first_frag = 0; int fragoff, frgpartlen; /* must be larger than u_int16_t */ struct ifnet *dstifp; u_int8_t ecn, ecn0; #if 0 char ip6buf[INET6_ADDRSTRLEN]; #endif ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), IPPROTO_DONE); ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset); #else IP6_EXTHDR_GET(ip6f, struct ip6_frag *, m, offset, sizeof(*ip6f)); if (ip6f == NULL) return (IPPROTO_DONE); #endif dstifp = NULL; #ifdef IN6_IFSTAT_STRICT /* find the destination interface of the packet. */ if ((ia = ip6_getdstifaddr(m)) != NULL) dstifp = ia->ia_ifp; #else /* we are violating the spec, this is not the destination interface */ if ((m->m_flags & M_PKTHDR) != 0) dstifp = m->m_pkthdr.rcvif; #endif /* jumbo payload can't contain a fragment header */ if (ip6->ip6_plen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); return IPPROTO_DONE; } /* * check whether fragment packet's fragment length is * multiple of 8 octets. * sizeof(struct ip6_frag) == 8 * sizeof(struct ip6_hdr) = 40 */ if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) && (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); return IPPROTO_DONE; } V_ip6stat.ip6s_fragments++; in6_ifstat_inc(dstifp, ifs6_reass_reqd); /* offset now points to data portion */ offset += sizeof(struct ip6_frag); IP6Q_LOCK(); /* * Enforce upper bound on number of fragments. * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ if (V_ip6_maxfrags < 0) ; else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags) goto dropfrag; for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst) #ifdef MAC && mac_ip6q_match(m, q6) #endif ) break; if (q6 == &V_ip6q) { /* * the first fragment to arrive, create a reassembly queue. */ first_frag = 1; /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly; * If maxfragpackets is 0, never accept fragments. * If maxfragpackets is -1, accept all fragments without * limitation. */ if (V_ip6_maxfragpackets < 0) ; else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets) goto dropfrag; V_frag6_nfragpackets++; q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE, M_NOWAIT); if (q6 == NULL) goto dropfrag; bzero(q6, sizeof(*q6)); #ifdef MAC if (mac_ip6q_init(q6, M_NOWAIT) != 0) { free(q6, M_FTABLE); goto dropfrag; } mac_ip6q_create(m, q6); #endif frag6_insque(q6, &V_ip6q); /* ip6q_nxt will be filled afterwards, from 1st fragment */ q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6; #ifdef notyet q6->ip6q_nxtp = (u_char *)nxtp; #endif q6->ip6q_ident = ip6f->ip6f_ident; q6->ip6q_ttl = IPV6_FRAGTTL; q6->ip6q_src = ip6->ip6_src; q6->ip6q_dst = ip6->ip6_dst; q6->ip6q_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. 
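 * (Editor's note, a worked example: if the original packet carried one
 * 8-byte Hop-by-Hop options header before the Fragment header, the
 * zero-offset fragment arrives with offset == 40 + 8 + 8 == 56, so
 * ip6q_unfrglen below becomes 56 - 40 - 8 == 8, and the total-size
 * check that follows then requires 8 + fragoff + frgpartlen <=
 * IPV6_MAXPACKET, i.e. 65535.)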
*/ q6->ip6q_nfrag = 0; } /* * If it's the 1st fragment, record the length of the * unfragmentable part and the next header of the fragment header. */ fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK); if (fragoff == 0) { q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag); q6->ip6q_nxt = ip6f->ip6f_nxt; } /* * Check that the reassembled packet would not exceed 65535 bytes * in size. * If it would exceed, discard the fragment and return an ICMP error. */ frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; if (q6->ip6q_unfrglen >= 0) { /* The 1st fragment has already arrived. */ if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); IP6Q_UNLOCK(); return (IPPROTO_DONE); } } else if (fragoff + frgpartlen > IPV6_MAXPACKET) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); IP6Q_UNLOCK(); return (IPPROTO_DONE); } /* * If it's the first fragment, do the above check for each * fragment already stored in the reassembly queue. */ if (fragoff == 0) { for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6dwn) { af6dwn = af6->ip6af_down; if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen > IPV6_MAXPACKET) { struct mbuf *merr = IP6_REASS_MBUF(af6); struct ip6_hdr *ip6err; int erroff = af6->ip6af_offset; /* dequeue the fragment. */ frag6_deq(af6); free(af6, M_FTABLE); /* adjust pointer. */ ip6err = mtod(merr, struct ip6_hdr *); /* * Restore source and destination addresses * in the erroneous IPv6 header. */ ip6err->ip6_src = q6->ip6q_src; ip6err->ip6_dst = q6->ip6q_dst; icmp6_error(merr, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); } } } ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FTABLE, M_NOWAIT); if (ip6af == NULL) goto dropfrag; bzero(ip6af, sizeof(*ip6af)); ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; ip6af->ip6af_offset = offset; IP6_REASS_MBUF(ip6af) = m; if (first_frag) { af6 = (struct ip6asfrag *)q6; goto insert; } /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; ecn0 = q6->ip6q_ecn; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) { free(ip6af, M_FTABLE); goto dropfrag; } if (ecn0 != IPTOS_ECN_CE) q6->ip6q_ecn = IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { free(ip6af, M_FTABLE); goto dropfrag; } /* * Find a segment which begins after this one does. */ for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) if (af6->ip6af_off > ip6af->ip6af_off) break; #if 0 /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (af6->ip6af_up != (struct ip6asfrag *)q6) { i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - ip6af->ip6af_off; if (i > 0) { if (i >= ip6af->ip6af_frglen) goto dropfrag; m_adj(IP6_REASS_MBUF(ip6af), i); ip6af->ip6af_off += i; ip6af->ip6af_frglen -= i; } } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. 
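 * (Editor's note, a worked example of this disabled policy: with a
 * queued fragment covering payload bytes [0, 16) and a new fragment
 * covering [8, 24), i = 0 + 16 - 8 = 8; since 8 is less than the new
 * fragment's 16-byte length, only its 8 duplicate leading bytes would
 * be m_adj()'ed off and its offset advanced to 16, leaving it to
 * contribute bytes [16, 24).)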
*/ while (af6 != (struct ip6asfrag *)q6 && ip6af->ip6af_off + ip6af->ip6af_frglen > af6->ip6af_off) { i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off; if (i < af6->ip6af_frglen) { af6->ip6af_frglen -= i; af6->ip6af_off += i; m_adj(IP6_REASS_MBUF(af6), i); break; } af6 = af6->ip6af_down; m_freem(IP6_REASS_MBUF(af6->ip6af_up)); frag6_deq(af6->ip6af_up); } #else /* * If the incoming fragment overlaps some existing fragments in * the reassembly queue, drop it, since it is dangerous to overwrite * existing fragments from a security point of view. * We don't know which fragment is the bad guy - here we trust the * fragment that arrived earlier, with no particular justification. * * Note: due to changes made after this part was disabled, the mbuf * passed to m_adj() below no longer meets its requirements. */ if (af6->ip6af_up != (struct ip6asfrag *)q6) { i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - ip6af->ip6af_off; if (i > 0) { #if 0 /* suppress the noisy log */ log(LOG_ERR, "%d bytes of a fragment from %s " "overlaps the previous fragment\n", i, ip6_sprintf(ip6buf, &q6->ip6q_src)); #endif free(ip6af, M_FTABLE); goto dropfrag; } } if (af6 != (struct ip6asfrag *)q6) { i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off; if (i > 0) { #if 0 /* suppress the noisy log */ log(LOG_ERR, "%d bytes of a fragment from %s " "overlaps the succeeding fragment\n", i, ip6_sprintf(ip6buf, &q6->ip6q_src)); #endif free(ip6af, M_FTABLE); goto dropfrag; } } #endif insert: #ifdef MAC if (!first_frag) mac_ip6q_update(m, q6); #endif /* * Stick the new segment in its place; * check for complete reassembly. * Move to the front of the packet queue, as we are * the most recently active fragmented packet. */ frag6_enq(ip6af, af6->ip6af_up); V_frag6_nfrags++; q6->ip6q_nfrag++; #if 0 /* xxx */ if (q6 != V_ip6q.ip6q_next) { frag6_remque(q6); frag6_insque(q6, &V_ip6q); } #endif next = 0; for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) { if (af6->ip6af_off != next) { IP6Q_UNLOCK(); return IPPROTO_DONE; } next += af6->ip6af_frglen; } if (af6->ip6af_up->ip6af_mff) { IP6Q_UNLOCK(); return IPPROTO_DONE; } /* * Reassembly is complete; concatenate the fragments. */ ip6af = q6->ip6q_down; t = m = IP6_REASS_MBUF(ip6af); af6 = ip6af->ip6af_down; frag6_deq(ip6af); while (af6 != (struct ip6asfrag *)q6) { af6dwn = af6->ip6af_down; frag6_deq(af6); while (t->m_next) t = t->m_next; t->m_next = IP6_REASS_MBUF(af6); m_adj(t->m_next, af6->ip6af_offset); free(af6, M_FTABLE); af6 = af6dwn; } /* adjust offset to point to where the original next header starts */ offset = ip6af->ip6af_offset - sizeof(struct ip6_frag); free(ip6af, M_FTABLE); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr)); if (q6->ip6q_ecn == IPTOS_ECN_CE) ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); nxt = q6->ip6q_nxt; #ifdef notyet *q6->ip6q_nxtp = (u_char)(nxt & 0xff); #endif /* Delete the frag6 header */ if (m->m_len >= offset + sizeof(struct ip6_frag)) { /* This is the only possible case with !PULLDOWN_TEST */ ovbcopy((caddr_t)ip6, (caddr_t)ip6 + sizeof(struct ip6_frag), offset); m->m_data += sizeof(struct ip6_frag); m->m_len -= sizeof(struct ip6_frag); } else { /* this involves no copy if the boundary is on a cluster */ if ((t = m_split(m, offset, M_DONTWAIT)) == NULL) { frag6_remque(q6); V_frag6_nfrags -= q6->ip6q_nfrag; #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FTABLE); V_frag6_nfragpackets--; goto dropfrag; } m_adj(t, sizeof(struct ip6_frag)); m_cat(m, t); } /* * Store NXT in the original.
*/ { char *prvnxtp = ip6_get_prevhdr(m, offset); /* XXX */ *prvnxtp = nxt; } frag6_remque(q6); V_frag6_nfrags -= q6->ip6q_nfrag; #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); #endif free(q6, M_FTABLE); V_frag6_nfragpackets--; if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ int plen = 0; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; } V_ip6stat.ip6s_reassembled++; in6_ifstat_inc(dstifp, ifs6_reass_ok); /* * Tell launch routine the next header */ *mp = m; *offp = offset; IP6Q_UNLOCK(); return nxt; dropfrag: IP6Q_UNLOCK(); in6_ifstat_inc(dstifp, ifs6_reass_fail); V_ip6stat.ip6s_fragdropped++; m_freem(m); return IPPROTO_DONE; } /* * Free a fragment reassembly header and all * associated datagrams. */ void frag6_freef(struct ip6q *q6) { INIT_VNET_INET6(curvnet); struct ip6asfrag *af6, *down6; IP6Q_LOCK_ASSERT(); for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = down6) { struct mbuf *m = IP6_REASS_MBUF(af6); down6 = af6->ip6af_down; frag6_deq(af6); /* * Return ICMP time exceeded error for the 1st fragment. * Just free other fragments. */ if (af6->ip6af_off == 0) { struct ip6_hdr *ip6; /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* restore source and destination addresses */ ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_REASSEMBLY, 0); } else m_freem(m); free(af6, M_FTABLE); } frag6_remque(q6); V_frag6_nfrags -= q6->ip6q_nfrag; #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FTABLE); V_frag6_nfragpackets--; } /* * Put an ip fragment on a reassembly chain. * Like insque, but pointers in middle of structure. */ void frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6) { IP6Q_LOCK_ASSERT(); af6->ip6af_up = up6; af6->ip6af_down = up6->ip6af_down; up6->ip6af_down->ip6af_up = af6; up6->ip6af_down = af6; } /* * To frag6_enq as remque is to insque. */ void frag6_deq(struct ip6asfrag *af6) { IP6Q_LOCK_ASSERT(); af6->ip6af_up->ip6af_down = af6->ip6af_down; af6->ip6af_down->ip6af_up = af6->ip6af_up; } void frag6_insque(struct ip6q *new, struct ip6q *old) { IP6Q_LOCK_ASSERT(); new->ip6q_prev = old; new->ip6q_next = old->ip6q_next; old->ip6q_next->ip6q_prev= new; old->ip6q_next = new; } void frag6_remque(struct ip6q *p6) { IP6Q_LOCK_ASSERT(); p6->ip6q_prev->ip6q_next = p6->ip6q_next; p6->ip6q_next->ip6q_prev = p6->ip6q_prev; } /* * IPv6 reassembling timer processing; * if a timer expires on a reassembly * queue, discard it. */ void frag6_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct ip6q *q6; IP6Q_LOCK(); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET6(vnet_iter); q6 = V_ip6q.ip6q_next; if (q6) while (q6 != &V_ip6q) { --q6->ip6q_ttl; q6 = q6->ip6q_next; if (q6->ip6q_prev->ip6q_ttl == 0) { V_ip6stat.ip6s_fragtimeout++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6->ip6q_prev); } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets && V_ip6q.ip6q_prev) { V_ip6stat.ip6s_fragoverflow++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(V_ip6q.ip6q_prev); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IP6Q_UNLOCK(); #if 0 /* * Routing changes might produce a better route than we last used; * make sure we notice eventually, even if forwarding only for one * destination and the cache is never replaced. 
*/ if (V_ip6_forward_rt.ro_rt) { RTFREE(V_ip6_forward_rt.ro_rt); V_ip6_forward_rt.ro_rt = 0; } if (ipsrcchk_rt.ro_rt) { RTFREE(ipsrcchk_rt.ro_rt); ipsrcchk_rt.ro_rt = 0; } #endif } /* * Drain off all datagram fragments. */ void frag6_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (IP6Q_TRYLOCK() == 0) return; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET6(vnet_iter); while (V_ip6q.ip6q_next != &V_ip6q) { V_ip6stat.ip6s_fragdropped++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(V_ip6q.ip6q_next); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); IP6Q_UNLOCK(); } Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 185087) +++ head/sys/netinet6/icmp6.c (revision 185088) @@ -1,2819 +1,2822 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif extern struct domain inet6domain; - -struct icmp6stat icmp6stat; - extern struct inpcbinfo ripcbinfo; extern struct inpcbhead ripcb; extern int icmp6errppslim; -static int icmp6errpps_count = 0; -static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; +#ifdef VIMAGE_GLOBALS +struct icmp6stat icmp6stat; +static int icmp6errpps_count; +static struct timeval icmp6errppslim_last; +#endif + static void icmp6_errcount(struct icmp6errstat *, int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag __P((struct in6_addr *, struct in6_addr *, struct in6_addr *)); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *)); static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int)); static int icmp6_notify_error(struct mbuf **, int, int, int); void icmp6_init(void) { INIT_VNET_INET6(curvnet); + + V_icmp6errpps_count = 0; mld6_init(); } static void icmp6_errcount(struct icmp6errstat *stat, int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: stat->icp6errs_dst_unreach_noroute++; return; case ICMP6_DST_UNREACH_ADMIN: stat->icp6errs_dst_unreach_admin++; return; case ICMP6_DST_UNREACH_BEYONDSCOPE: stat->icp6errs_dst_unreach_beyondscope++; return; case ICMP6_DST_UNREACH_ADDR: stat->icp6errs_dst_unreach_addr++; return; case ICMP6_DST_UNREACH_NOPORT: stat->icp6errs_dst_unreach_noport++; return; } break; case ICMP6_PACKET_TOO_BIG: stat->icp6errs_packet_too_big++; return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: stat->icp6errs_time_exceed_transit++; return; case ICMP6_TIME_EXCEED_REASSEMBLY: stat->icp6errs_time_exceed_reassembly++; return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: stat->icp6errs_paramprob_header++; return; case ICMP6_PARAMPROB_NEXTHEADER: stat->icp6errs_paramprob_nextheader++; return; case ICMP6_PARAMPROB_OPTION: stat->icp6errs_paramprob_option++; return; } break; case ND_REDIRECT: stat->icp6errs_redirect++; return; } stat->icp6errs_unknown++; } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. 
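 *
 * Editor's note: in6_setscope() is what embeds the zone (for example
 * the interface index of a link-local fe80:: address) into the
 * kernel-internal form of the address, using the supplied ifp. A
 * hedged sketch of a caller that only knows the receiving interface:
 *
 *	icmp6_error2(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT,
 *	    0, m->m_pkthdr.rcvif);
 *
 * Without the ifp there would be no way to tell which link a scoped
 * source or destination address belongs to.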
*/ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of the given type in response to a bad IPv6 * packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { INIT_VNET_INET6(curvnet); struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; V_icmp6stat.icp6s_error++; /* count per-type-code statistics */ icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { V_icmp6stat.icp6s_canterror++; goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - a Parameter Problem message may be sent if the option type field * allows an icmp6 error. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of an anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against an ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for a redirect (which is * informational) we must not send an icmp6 error. */ V_icmp6stat.icp6s_canterror++; goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do the rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { V_icmp6stat.icp6s_toofreq++; goto freeit; } /* * OK, ICMP6 can be generated.
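 *
 * (Editor's note: the m_adj() below keeps the reflected packet within
 * the IPv6 minimum MTU: ICMPV6_PLD_MAXLEN is IPV6_MMTU (1280) minus
 * the 40-byte IPv6 header and the 8-byte ICMPv6 header, i.e. 1232
 * bytes of quoted payload, so the error never needs fragmentation.)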
*/ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_DONTWAIT); if (m && m->m_len < preplen) m = m_pullup(m, preplen); if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). */ m->m_pkthdr.rcvif = NULL; V_icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { V_icmp6stat.icp6s_tooshort++; goto freeit; } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { V_icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); V_icmp6stat.icp6s_checksum++; goto freeit; } if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } V_icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." 
*/ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; const int maxlen = sizeof(*nip6) + sizeof(*nicmp6); int n0len; MGETHDR(n, M_DONTWAIT, n0->m_type); n0len = n0->m_pkthdr.len; /* save for use below */ if (n) M_MOVE_PKTHDR(n, n0); if (n && maxlen >= MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ m_freem(n0); break; } /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { nip6 = mtod(n, struct ip6_hdr *); IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { V_icmp6stat.icp6s_reflect++; V_icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); m = NULL; goto freeit; } mld6_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? 
*/ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; #define hostnamelen strlen(V_hostname) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxlen, maxhlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4; if (maxlen >= MCLBYTES) { /* Give up remote */ break; } MGETHDR(n, M_DONTWAIT, m->m_type); if (n && maxlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n && !m_dup_pkthdr(n, m, M_DONTWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; } if (n == NULL) { /* Give up remote */ break; } n->m_pkthdr.rcvif = NULL; n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - maxlen; mtx_lock(&hostname_mtx); if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); bcopy(V_hostname, p + 4, maxhlen); /* meaningless TTL */ mtx_unlock(&hostname_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } #undef hostnamelen if (n) { V_icmp6stat.icp6s_reflect++; V_icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. 
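 * (as in the RS/RA cases above: the ND code consumes its private
 * copy n, while the original m survives to reach icmp6_rip6_input()
 * at the end of this function.)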
*/ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code)) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: V_icmp6stat.icp6s_badcode++; break; badlen: V_icmp6stat.icp6s_badlen++; break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { V_icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. 
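 * (i.e. the address the packet was ultimately being routed toward,
 * not the intermediate hop currently sitting in ip6_dst.)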
* Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { V_icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. 
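 * (so the zone of the receiving interface is assumed, via the
 * in6_setscope() calls below.)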
*/ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { INIT_VNET_INET6(curvnet); struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_flags = 1; /* IPv6 */ inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); V_icmp6stat.icp6s_pmtuchg++; } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. 
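 * (that draft was later published, with changes, as RFC 4620.)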
* * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ #define hostnamelen strlen(V_hostname) static struct mbuf * ni6_input(struct mbuf *m, int off) { INIT_VNET_INET6(curvnet); INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ if ((ia6 = ip6_getdstifaddr(m)) == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? 
* * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ mtx_lock(&hostname_mtx); n = ni6_nametodns(V_hostname, hostnamelen, 0); mtx_unlock(&hostname_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. */ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } M_MOVE_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... 
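 * (2048 vs. 1280 bytes in the usual configuration, so a single
 * cluster holds any reply this function is willing to build.)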
*/ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? */ mtx_lock(&hostname_mtx); n->m_next = ni6_nametodns(V_hostname, hostnamelen, oldfqdn); mtx_unlock(&hostname_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } #undef hostnamelen /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". 
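 * For example, "foo.example.com" (two dots, so nterm is 1) encodes as
 *     \003 f o o \007 e x a m p l e \003 c o m \000
 * while a name with fewer than two dots is treated as a shortened
 * hostname and receives a second terminating \0.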
*/ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { addrsofif = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. 
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; IFNET_RUNLOCK(); return (addrsofif); } addrs += addrsofif; } IFNET_RUNLOCK(); return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet); struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ IFNET_RLOCK(); again: for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; IFNET_RUNLOCK(); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. 
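 * In practice the TTL below is therefore the remaining SLAAC valid
 * lifetime: ND6_INFINITE_LIFETIME if ia6t_expire is 0, otherwise the
 * seconds left until ia6t_expire (0 if already past), via htonl().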
*/ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_second) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } IFNET_RUNLOCK(); return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { INIT_VNET_INET(curvnet); INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct in6pcb *in6p; struct in6pcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) { INP_RUNLOCK(in6p); continue; } if (last) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. 
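 * (the copy below is attempted only when the whole packet already
 * fits in an ordinary mbuf: a single cluster mbuf whose data length
 * is at most MHLEN.)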
*/ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked( &last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); INP_RUNLOCK(last); } else { m_freem(m); V_ip6stat.ip6s_delivered--; } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; int plen; int type, code; struct ifnet *outif = NULL; struct in6_addr origdst, *src = NULL; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ origdst = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. 
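 * (this follows RFC 4443, section 2.2: a reply to a message sent to
 * one of our unicast addresses must be sourced from that address.)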
* The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. * Note that ip6_getdstifaddr() may fail if we are in an error handling * procedure of an outgoing packet of our own, in which case we need * to search in the ifaddr list. */ if (!IN6_IS_ADDR_MULTICAST(&origdst)) { if ((ia = ip6_getdstifaddr(m))) { if (!(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) src = &ia->ia_addr.sin6_addr; } else { struct sockaddr_in6 d; bzero(&d, sizeof(d)); d.sin6_family = AF_INET6; d.sin6_len = sizeof(d); d.sin6_addr = origdst; ia = (struct in6_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&d); if (ia && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src = &ia->ia_addr.sin6_addr; } } } if (src == NULL) { int e; struct sockaddr_in6 sin6; struct route_in6 ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, &outif, &e); if (ro.ro_rt) RTFREE(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &sin6.sin6_addr), e)); goto bad; } } ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; if (outif) ip6->ip6_hlim = ND_IFINFO(outif)->chlim; else if (m->m_pkthdr.rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else ip6->ip6_hlim = V_ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { return; } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { INIT_VNET_INET6(curvnet); struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; if (!m) return; ifp = m->m_pkthdr.rcvif; if (!ifp) return; /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { V_icmp6stat.icp6s_tooshort++; 
return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, gw6), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } RTFREE_LOCKED(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (ndopts.nd_opts_rh) { redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len; redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */ } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); if (!is_onlink) { /* better router case. perform rtredirect. 
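 * (i.e. install a host route to reddst6 with redtgt6 as gateway; the
 * on-link case needs only the lladdr cache update done above.)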
*/ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; bzero(&sdst, sizeof(sdst)); bzero(&sgw, sizeof(sgw)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&ssrc); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&sdst); #endif /* IPSEC */ } freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { INIT_VNET_INET6(curvnet); struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) MCLGET(m, M_DONTWAIT); if (!m) goto fail; m->m_pkthdr.rcvif = NULL; m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ struct rtentry *rt_router = NULL; int len; struct sockaddr_dl *sdl; struct nd_opt_hdr *nd_opt; char *lladdr; rt_router = nd6_lookup(router_ll6, 0, ifp); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); p += len; } } nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
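 * For example, an original packet of 1005 bytes has extra = 5; with
 * at least 3 bytes of trailing space it is padded up to 1008 bytes,
 * otherwise it is truncated down to 1000.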
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } V_icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? 
* * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { INIT_VNET_INET6(curvnet); int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6_ifattach.c =================================================================== --- head/sys/netinet6/in6_ifattach.c (revision 185087) +++ head/sys/netinet6/in6_ifattach.c (revision 185088) @@ -1,920 +1,916 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -unsigned long in6_maxmtu = 0; - -#ifdef IP6_AUTO_LINKLOCAL -int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; -#else -int ip6_auto_linklocal = 1; /* enable by default */ -#endif - +#ifdef VIMAGE_GLOBALS +unsigned long in6_maxmtu; +int ip6_auto_linklocal; struct callout in6_tmpaddrtimer_ch; +#endif extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ MD5_CTX ctxt; u_int8_t digest[16]; int hostnamelen; mtx_lock(&hostname_mtx); hostnamelen = strlen(V_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) return -1; #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, V_hostname, hostnamelen); mtx_unlock(&hostname_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { INIT_VNET_INET6(curvnet); MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. 
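 * (the 16-byte seed hashed below is the 8-byte history value
 * followed by the right-most 8 bytes of the given address, per
 * RFC 3041 section 3.2.1.)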
*/ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } return -1; found: addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) return -1; /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) return -1; if (bcmp(addr, allone, addrlen) == 0) return -1; /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_ARCNET: if (addrlen != 1) return -1; if (!addr[0]) return -1; bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; /* * due to insufficient bitwidth, we mark it local. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ break; case IFT_GIF: #ifdef IFT_STF case IFT_STF: #endif /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. 
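 * (tunnel pseudo-interfaces therefore fall through to borrowing a
 * globally unique ifid from another interface, or to the random
 * last-resort ifid, in get_ifid().)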
*/ return -1; default: return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) return -1; /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { return -1; } return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { INIT_VNET_NET(ifp0->if_vnet); INIT_VNET_INET6(ifp0->if_vnet); struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK(); for (ifp = V_ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK(); goto success; } } IFNET_RUNLOCK(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; int i, error; /* * configure link-local address. */ bzero(&ifra, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; ifra.ifra_prefixmask.sin6_addr = in6mask64; /* link-local addresses should NEVER expire. 
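 * (ND6_INFINITE_LIFETIME is the all-ones sentinel that the ND code
 * treats as "never expires".)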
*/ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ #ifdef DIAGNOSTIC if (!ia) { panic("ia == NULL in in6_ifattach_linklocal"); /* NOTREACHED */ } #endif /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ for (i = 0; i < 4; i++) { pr0.ndpr_prefix.sin6_addr.s6_addr32[i] &= in6mask64.s6_addr32[i]; } /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if (nd6_prefix_lookup(&pr0) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { INIT_VNET_INET6(curvnet); struct in6_aliasreq ifra; int error; bzero(&ifra, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; ifra.ifra_prefixmask.sin6_addr = in6mask128; /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* we don't need to perform DAD on loopback interfaces. 
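 * (the only node that could answer a duplicate-address probe on
 * loopback is ourselves, so the check could never add information)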
*/ ifra.ifra_flags |= IN6_IFF_NODAD; /* skip registration to the prefix list. XXX should be temporary. */ ifra.ifra_flags |= IN6_IFF_NOPFX; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later). * * when ifp == NULL, the caller is responsible for filling scopeid. */ int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ if (!namelen || !name) return -1; p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p - name > sizeof(n) - 1) return -1; /* label too long */ l = p - name; strncpy(n, name, l); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *ia; struct in6_addr in6; /* some of the interfaces are inherently not IPv6 capable */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_CARP: return; } /* * quirks based on interface type */ switch (ifp->if_type) { #ifdef IFT_STF case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ goto statinit; #endif default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { in6 = in6addr_loopback; if (in6ifa_ifpwithaddr(ifp, &in6) == NULL) { if (in6_ifattach_loopback(ifp) != 0) return; } } /* * assign a link-local address, if there's none. */ if (V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { if (in6_ifattach_linklocal(ifp, altifp) == 0) { /* linklocal address assigned */ } else { /* failed to assign linklocal address. bark? */ } } } #ifdef IFT_STF /* XXX */ statinit: #endif /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * We don't need this function in bsdi, because interfaces are never removed * from the ifnet list in bsdi. 
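 *
 * Before it, an illustrative aside: the link-local addresses that
 * in6_ifattach() configures above derive their interface identifier
 * (via in6_get_hw_ifid()) from the hardware address.  A minimal
 * userland sketch of the 48-bit-MAC branch of that conversion --
 * insert 0xff/0xfe in the middle and flip the universal/local bit,
 * as EUI64_TO_IFID() does.  All names in the sketch are hypothetical
 * and it is not part of this file:
 */

#include <stdio.h>
#include <stdint.h>

/* Build a modified EUI-64 interface ID from a 48-bit MAC address. */
static void
mac_to_ifid(const uint8_t mac[6], uint8_t ifid[8])
{
	ifid[0] = mac[0] ^ 0x02;	/* invert the u/l bit (EUI64_UBIT) */
	ifid[1] = mac[1];
	ifid[2] = mac[2];
	ifid[3] = 0xff;			/* EUI-64 filler bytes */
	ifid[4] = 0xfe;
	ifid[5] = mac[3];
	ifid[6] = mac[4];
	ifid[7] = mac[5];
}

int
main(void)
{
	const uint8_t mac[6] = { 0x00, 0x0c, 0x29, 0xab, 0xcd, 0xef };
	uint8_t ifid[8];
	int i;

	mac_to_ifid(mac, ifid);
	printf("fe80:");		/* link-local prefix, zero-compressed */
	for (i = 0; i < 8; i += 2)
		printf(":%02x%02x", ifid[i], ifid[i + 1]);
	printf("\n");			/* fe80::020c:29ff:feab:cdef */
	return (0);
}

/*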
*/ void in6_ifdetach(struct ifnet *ifp) { INIT_VNET_NET(ifp->if_vnet); INIT_VNET_INET(ifp->if_vnet); INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *ia, *oia; struct ifaddr *ifa, *next; struct rtentry *rt; short rtflags; struct sockaddr_in6 sin6; struct in6_multi_mship *imm; /* remove neighbor management table */ nd6_purge(ifp); /* nuke any of IPv6 addresses we have */ for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) { next = ifa->ifa_list.tqe_next; if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* undo everything done by in6_ifattach(), just in case */ for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) { next = ifa->ifa_list.tqe_next; if (ifa->ifa_addr->sa_family != AF_INET6 || !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)->sin6_addr)) { continue; } ia = (struct in6_ifaddr *)ifa; /* * leave from multicast groups we have joined for the interface */ while ((imm = ia->ia6_memberships.lh_first) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } /* remove from the routing table */ if ((ia->ia_flags & IFA_ROUTE) && (rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0UL))) { rtflags = rt->rt_flags; rtfree(rt); rtrequest(RTM_DELETE, (struct sockaddr *)&ia->ia_addr, (struct sockaddr *)&ia->ia_addr, (struct sockaddr *)&ia->ia_prefixmask, rtflags, (struct rtentry **)0); } /* remove from the linked list */ TAILQ_REMOVE(&ifp->if_addrlist, (struct ifaddr *)ia, ifa_list); IFAFREE(&ia->ia_ifa); /* also remove from the IPv6 address chain(itojun&jinmei) */ oia = ia; if (oia == (ia = V_in6_ifaddr)) V_in6_ifaddr = ia->ia_next; else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; if (ia->ia_next) ia->ia_next = oia->ia_next; else { nd6log((LOG_ERR, "%s: didn't unlink in6ifaddr from list\n", if_name(ifp))); } } IFAFREE(&oia->ia_ifa); } in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * remove neighbor management table. we call it twice just to make * sure we nuke everything. maybe we need just one call. * XXX: since the first call did not release addresses, some prefixes * might remain. We should call nd6_purge() again to release the * prefixes after removing all addresses above. * (Or can we just delay calling nd6_purge until at this point?) */ nd6_purge(ifp); /* remove route to link-local allnodes multicast (ff02::1) */ bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = in6addr_linklocal_allnodes; if (in6_setscope(&sin6.sin6_addr, ifp, NULL)) /* XXX: should not fail */ return; /* XXX grab lock first to avoid LOR */ if (V_rt_tables[0][AF_INET6] != NULL) { RADIX_NODE_HEAD_LOCK(V_rt_tables[0][AF_INET6]); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_ifp == ifp) rtexpunge(rt); RTFREE_LOCKED(rt); } RADIX_NODE_HEAD_UNLOCK(V_rt_tables[0][AF_INET6]); } } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. 
*/ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *ignored_arg) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); bzero(nullbuf, sizeof(nullbuf)); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } } static void in6_purgemaddrs(struct ifnet *ifp) { struct in6_multi *in6m; struct in6_multi *oin6m; #ifdef DIAGNOSTIC printf("%s: purging ifp %p\n", __func__, ifp); #endif IFF_LOCKGIANT(ifp); LIST_FOREACH_SAFE(in6m, &in6_multihead, in6m_entry, oin6m) { if (in6m->in6m_ifp == ifp) in6_delmulti(in6m); } IFF_UNLOCKGIANT(ifp); } Index: head/sys/netinet6/in6_proto.c =================================================================== --- head/sys/netinet6/in6_proto.c (revision 185087) +++ head/sys/netinet6/in6_proto.c (revision 185088) @@ -1,585 +1,568 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_carp.h" #include "opt_sctp.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. 
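 *
 * The inet6sw[] table below is consulted by protocol number: at init
 * time the stack builds an index from IPPROTO_* values to table slots,
 * with unknown protocols falling through to the trailing raw wildcard
 * entry.  A minimal userland model of that dispatch (illustrative
 * only; every name here is hypothetical):
 */

#include <stdio.h>

#define	MINI_IPPROTO_MAX	256

struct mini_protosw {
	int	 pr_protocol;		/* 0 marks the wildcard entry */
	void	(*pr_input)(int proto);
};

static void tcp6_in(int p) { printf("tcp6_input gets proto %d\n", p); }
static void udp6_in(int p) { printf("udp6_input gets proto %d\n", p); }
static void rip6_in(int p) { printf("rip6_input gets proto %d\n", p); }

static struct mini_protosw mini_sw[] = {
	{ 6,	tcp6_in },		/* IPPROTO_TCP */
	{ 17,	udp6_in },		/* IPPROTO_UDP */
	{ 0,	rip6_in },		/* raw wildcard, keep last */
};
static unsigned char mini_protox[MINI_IPPROTO_MAX];

int
main(void)
{
	const int wildcard = sizeof(mini_sw) / sizeof(mini_sw[0]) - 1;
	int i, p;

	/* default everything to the wildcard, then register real slots */
	for (p = 0; p < MINI_IPPROTO_MAX; p++)
		mini_protox[p] = wildcard;
	for (i = 0; i < wildcard; i++)
		mini_protox[mini_sw[i].pr_protocol] = i;

	mini_sw[mini_protox[17]].pr_input(17);	/* udp6_input */
	mini_sw[mini_protox[99]].pr_input(99);	/* unknown -> rip6_input */
	return (0);
}

/*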
*/ extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 struct ip6protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization and timeout routines twice */ .pr_init = tcp_init, .pr_fasttimo = tcp_fasttimo, .pr_slowtimo = tcp_slowtimo, #endif .pr_drain = tcp_drain, .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_init = icmp6_init, .pr_fasttimo = icmp6_fasttimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = esp6_ctlinput, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC */ #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, 
.pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #ifdef DEV_CARP { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #endif /* DEV_CARP */ /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; extern int in6_inithead(void **, int); struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *) &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn6_mpath_inithead, #else .dom_rtattach = in6_inithead, #endif .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, .dom_maxrtkey = sizeof(struct sockaddr_in6), .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach }; DOMAIN_SET(inet6); /* * Internet configuration info */ -#ifndef IPV6FORWARDING -#ifdef GATEWAY6 -#define IPV6FORWARDING 1 /* forward IP6 packets not for us */ -#else -#define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ -#endif /* GATEWAY6 */ -#endif /* !IPV6FORWARDING */ - -#ifndef IPV6_SENDREDIRECTS -#define IPV6_SENDREDIRECTS 1 +#ifdef VIMAGE_GLOBALS +int ip6_forwarding; +int ip6_sendredirects; +int ip6_defhlim; +int ip6_defmcasthlim; +int ip6_accept_rtadv; +int ip6_maxfragpackets; +int ip6_maxfrags; +int ip6_log_interval; +int ip6_hdrnestlimit; +int ip6_dad_count; +int ip6_auto_flowlabel; +int ip6_use_deprecated; +int ip6_rr_prune; +int ip6_mcast_pmtu; +int ip6_v6only; +int ip6_keepfaith; +time_t ip6_log_time; +int ip6stealth; +int nd6_onlink_ns_rfc4861; #endif -int ip6_forwarding = IPV6FORWARDING; /* act as router? */ -int ip6_sendredirects = IPV6_SENDREDIRECTS; -int ip6_defhlim = IPV6_DEFHLIM; -int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; -int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ -int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ -int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ -int ip6_log_interval = 5; -int ip6_hdrnestlimit = 15; /* How many header options will we process? */ -int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ -int ip6_auto_flowlabel = 1; -int ip6_gif_hlim = 0; -int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ -int ip6_rr_prune = 5; /* router renumbering prefix - * walk list every 5 sec. */ -int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? 
*/ -int ip6_v6only = 1; - -int ip6_keepfaith = 0; -time_t ip6_log_time = (time_t)0L; -#ifdef IPSTEALTH -int ip6stealth = 0; -#endif -int nd6_onlink_ns_rfc4861 = 0; /* allow 'on-link' nd6 NS (as in RFC 4861) */ - +#ifdef VIMAGE_GLOBALS /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ -int pmtu_expire = 60*10; -int pmtu_probe = 60*2; +int pmtu_expire; +int pmtu_probe; /* raw IP6 parameters */ /* * Nominal space allocated to a raw ip socket. */ -#define RIPV6SNDQ 8192 -#define RIPV6RCVQ 8192 +u_long rip6_sendspace; +u_long rip6_recvspace; -u_long rip6_sendspace = RIPV6SNDQ; -u_long rip6_recvspace = RIPV6RCVQ; - /* ICMPV6 parameters */ -int icmp6_rediraccept = 1; /* accept and process redirects */ -int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ -int icmp6errppslim = 100; /* 100pps */ +int icmp6_rediraccept; +int icmp6_redirtimeout; +int icmp6errppslim; /* control how to respond to NI queries */ -int icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); +int icmp6_nodeinfo; /* UDP on IP6 parameters */ -int udp6_sendspace = 9216; /* really max datagram size */ -int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); - /* 40 1K datagrams */ +int udp6_sendspace; +int udp6_recvspace; +#endif /* VIMAGE_GLOBALS */ /* * sysctl related items. */ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW, 0, "TCP6"); #ifdef SCTP SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW, 0, "SCTP6"); #endif #ifdef IPSEC SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_preferred_lifetime < V_ip6_desync_factor + V_ip6_temp_regen_advance) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_RW, ip6_forwarding, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_RW, ip6_sendredirects, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_RW, ip6_defhlim, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD, ip6stat, ip6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_RW, ip6_maxfragpackets, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_RW, ip6_accept_rtadv, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, 
_net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith, CTLFLAG_RW, ip6_keepfaith, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_RW, ip6_log_interval, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_RW, ip6_hdrnestlimit, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_RW, ip6_dad_count, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_RW, ip6_auto_flowlabel, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_RW, ip6_defmcasthlim, 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_RW, ip6_use_deprecated, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_RW, ip6_rr_prune, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_RW, ip6_use_tempaddr, 0, ""); SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLTYPE_INT|CTLFLAG_RW, ip6_temp_preferred_lifetime, 0, sysctl_ip6_temppltime, "I", ""); SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, CTLTYPE_INT|CTLFLAG_RW, ip6_temp_valid_lifetime, 0, sysctl_ip6_tempvltime, "I", ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_RW, ip6_v6only, 0, ""); #ifndef VIMAGE TUNABLE_INT("net.inet6.ip6.auto_linklocal", &ip6_auto_linklocal); #endif SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_RW, ip6_auto_linklocal, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, rip6stat, rip6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_RW, ip6_prefer_tempaddr, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_RW, ip6_use_defzone, 0,""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW, ip6_maxfrags, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_RW, ip6_mcast_pmtu, 0, ""); #ifdef IPSTEALTH SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, ip6stealth, 0, ""); #endif /* net.inet6.icmp6 */ SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_RW, icmp6_rediraccept, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_RW, icmp6_redirtimeout, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD, icmp6stat, icmp6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_RW, nd6_prune, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_RW, nd6_delay, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_RW, nd6_umaxtries, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_RW, nd6_mmaxtries, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_RW, nd6_useloopback, 0, ""); SYSCTL_V_INT(V_NET, 
vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_RW, icmp6_nodeinfo, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_RW, icmp6errppslim, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_RW, nd6_maxnudhint, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_RW, nd6_debug, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_RW, &nd6_onlink_ns_rfc4861, 0, "Accept 'on-link' nd6 NS in compliance with RFC 4861."); Index: head/sys/netinet6/in6_rmx.c =================================================================== --- head/sys/netinet6/in6_rmx.c (revision 185087) +++ head/sys/netinet6/in6_rmx.c (revision 185088) @@ -1,490 +1,499 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $ */ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. 
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * This code does two things necessary for the enhanced TCP metrics to * function in a useful manner: * 1) It marks all non-host routes as `cloning', thus ensuring that * every actual reference to such a route actually gets turned * into a reference to a host route to the specific destination * requested. * 2) When such routes lose all their references, it arranges for them * to be deleted in some random collection of circumstances, so that * a large quantity of stale routing data is not kept in kernel memory * indefinitely. See in6_rtqtimo() below for the exact mechanism. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in6_inithead(void **head, int off); #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* * Do what we need to do when inserting a route. */ static struct radix_node * in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt); struct radix_node *ret; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) rt->rt_flags |= RTF_MULTICAST; /* * A little bit of help for both IPv6 output and input: * For local addresses, we make sure that RTF_LOCAL is set, * with the thought that this might one day be used to speed up * ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). (This * is done above.) * * XXX * should elaborate the code. */ if (rt->rt_flags & RTF_HOST) { if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr) ->sin6_addr, &sin6->sin6_addr)) { rt->rt_flags |= RTF_LOCAL; } } if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp); ret = rn_addroute(v_arg, n_arg, head, treenodes); if (ret == NULL && rt->rt_flags & RTF_HOST) { struct rtentry *rt2; /* * We are trying to add a host route, but can't. * Find out if it is because of an * ARP entry and delete it if so. */ rt2 = rtalloc1((struct sockaddr *)sin6, 0, RTF_CLONING); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK) { rtexpunge(rt2); RTFREE_LOCKED(rt2); ret = rn_addroute(v_arg, n_arg, head, treenodes); } else RTFREE_LOCKED(rt2); } } else if (ret == NULL && rt->rt_flags & RTF_CLONING) { struct rtentry *rt2; /* * We are trying to add a net route, but can't. * The following case should be allowed, so we'll make a * special check for this: * Two IPv6 addresses with the same prefix is assigned * to a single interrface. * # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1) * # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2) * In this case, (*1) and (*2) want to add the same * net route entry, 3ffe:0501:: -> if0. 
* This case should not raise an error. */ rt2 = rtalloc1((struct sockaddr *)sin6, 0, RTF_CLONING); if (rt2) { if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY)) == RTF_CLONING && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK && rt2->rt_ifp == rt->rt_ifp) { ret = rt2->rt_nodes; } RTFREE_LOCKED(rt2); } } return ret; } /* * This code is the inverse of in6_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * in6_matroute(void *v_arg, struct radix_node_head *head) { struct radix_node *rn = rn_match(v_arg, head); struct rtentry *rt = (struct rtentry *)rn; if (rt && rt->rt_refcnt == 0) { /* this is first reference */ if (rt->rt_flags & RTPRF_OURS) { rt->rt_flags &= ~RTPRF_OURS; rt->rt_rmx.rmx_expire = 0; } } return rn; } SYSCTL_DECL(_net_inet6_ip6); -static int rtq_reallyold6 = 60*60; - /* one hour is ``really old'' */ +#ifdef VIMAGE_GLOBALS +static int rtq_reallyold6; +static int rtq_minreallyold6; +static int rtq_toomany6; +#endif + SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire, CTLFLAG_RW, &rtq_reallyold6 , 0, ""); -static int rtq_minreallyold6 = 10; - /* never automatically crank down to less */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, &rtq_minreallyold6 , 0, ""); -static int rtq_toomany6 = 128; - /* 128 cached routes is ``too many'' */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, &rtq_toomany6 , 0, ""); /* * On last reference drop, mark the route as belong to us so that it can be * timed out. */ static void in6_clsroute(struct radix_node *rn, struct radix_node_head *head) { INIT_VNET_INET6(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) return; if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED) return; /* * As requested by David Greenman: * If rtq_reallyold6 is 0, just delete the route without * waiting for a timeout cycle to kill it. */ if (V_rtq_reallyold6 != 0) { rt->rt_flags |= RTPRF_OURS; rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold6; } else { rtexpunge(rt); } } struct rtqk_arg { struct radix_node_head *rnh; int mode; int updating; int draining; int killed; int found; time_t nextstop; }; /* * Get rid of old routes. When draining, this deletes everything, even when * the timeout is not expired yet. When updating, this makes sure that * nothing has a timeout longer than the current value of rtq_reallyold6. 
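 *
 * The hunk above is part of this revision's wider change: the rtq_*
 * knobs lose their static initializers, are declared under
 * VIMAGE_GLOBALS, and get their defaults assigned in in6_inithead()
 * instead, so that each virtual network stack can carry its own copy.
 * A minimal userland model of the V_ indirection follows; it is
 * illustrative only and every name in it is hypothetical:
 */

#include <stdio.h>

/* one instance of the formerly-global state per network stack */
struct mini_vnet_inet6 {
	int	_rtq_reallyold;
	int	_rtq_toomany;
};

#ifdef MINI_VIMAGE
static struct mini_vnet_inet6 mini_vnet0;	/* per-stack instance */
static struct mini_vnet_inet6 *mini_curvnet = &mini_vnet0;
#define	V_rtq_reallyold	(mini_curvnet->_rtq_reallyold)
#define	V_rtq_toomany	(mini_curvnet->_rtq_toomany)
#else
static int rtq_reallyold;			/* flat globals */
static int rtq_toomany;
#define	V_rtq_reallyold	rtq_reallyold
#define	V_rtq_toomany	rtq_toomany
#endif

int
main(void)
{
	/*
	 * Defaults can no longer live in the declaration; they are set
	 * when an instance is initialized, as in6_inithead() now does.
	 */
	V_rtq_reallyold = 60 * 60;	/* one hour is ``really old'' */
	V_rtq_toomany = 128;
	printf("reallyold=%d toomany=%d\n",
	    V_rtq_reallyold, V_rtq_toomany);
	return (0);
}

/*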
*/ static int in6_rtqkill(struct radix_node *rn, void *rock) { INIT_VNET_INET6(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; if (rt->rt_flags & RTPRF_OURS) { ap->found++; if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); if (err) { log(LOG_WARNING, "in6_rtqkill: error %d", err); } else { ap->killed++; } } else { if (ap->updating && (rt->rt_rmx.rmx_expire - time_uptime > V_rtq_reallyold6)) { rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold6; } ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); } } return 0; } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout6 = RTQ_TIMEOUT; +#ifdef VIMAGE_GLOBALS +static int rtq_timeout6; static struct callout rtq_timer6; +#endif static void in6_rtqtimo(void *rock) { CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); struct radix_node_head *rnh = rock; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = time_uptime + V_rtq_timeout6; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); /* * Attempt to be somewhat dynamic about this: * If there are ``too many'' routes sitting around taking up space, * then crank down the timeout, and see if we can't make some more * go away. However, we make sure that we will never adjust more * than once in rtq_timeout6 seconds, to keep from cranking down too * hard. */ if ((arg.found - arg.killed > V_rtq_toomany6) && (time_uptime - last_adjusted_timeout >= V_rtq_timeout6) && V_rtq_reallyold6 > V_rtq_minreallyold6) { V_rtq_reallyold6 = 2*V_rtq_reallyold6 / 3; if (V_rtq_reallyold6 < V_rtq_minreallyold6) { V_rtq_reallyold6 = V_rtq_minreallyold6; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold6 to %d", V_rtq_reallyold6); #endif arg.found = arg.killed = 0; arg.updating = 1; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; callout_reset(&V_rtq_timer6, tvtohz(&atv), in6_rtqtimo, rock); CURVNET_RESTORE(); } /* * Age old PMTUs. 
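 *
 * Both this PMTU walk and the route-expiry walk above share a pattern:
 * scan the tree, act on entries whose rmx_expire has passed, remember
 * the earliest expiry still pending, and re-arm the callout for that
 * moment, bounded by a fixed default period.  A minimal model of the
 * re-arm computation, with hypothetical names, illustrative only:
 */

#include <stdio.h>
#include <time.h>

/* Earliest pending expiry wins; fall back to the default period,
 * as in6_mtutimo() does with MTUTIMO_DEFAULT. */
static time_t
next_timeout(const time_t *expire, int n, time_t uptime, time_t period)
{
	time_t next = uptime + period;
	int i;

	for (i = 0; i < n; i++)
		if (expire[i] != 0 && expire[i] < next)
			next = expire[i];
	return (next - uptime);	/* seconds until the callout fires */
}

int
main(void)
{
	time_t expires[] = { 0, 130, 160 };

	/* with uptime 100 and a 60s default period: fires in 30s (130) */
	printf("%ld\n", (long)next_timeout(expires, 3, 100, 60));
	return (0);
}

/*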
*/ struct mtuex_arg { struct radix_node_head *rnh; time_t nextstop; }; +#ifdef VIMAGE_GLOBALS static struct callout rtq_mtutimer; +#endif static int in6_mtuexpire(struct radix_node *rn, void *rock) { struct rtentry *rt = (struct rtentry *)rn; struct mtuex_arg *ap = rock; /* sanity */ if (!rt) panic("rt == NULL in in6_mtuexpire"); if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) { if (rt->rt_rmx.rmx_expire <= time_uptime) { rt->rt_flags |= RTF_PROBEMTU; } else { ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); } } return 0; } #define MTUTIMO_DEFAULT (60*1) static void in6_mtutimo(void *rock) { CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); struct radix_node_head *rnh = rock; struct mtuex_arg arg; struct timeval atv; arg.rnh = rnh; arg.nextstop = time_uptime + MTUTIMO_DEFAULT; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; if (atv.tv_sec < 0) { printf("invalid mtu expiration time on routing table\n"); arg.nextstop = time_uptime + 30; /* last resort */ atv.tv_sec = 30; } callout_reset(&V_rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock); CURVNET_RESTORE(); } #if 0 void in6_rtqdrain(void) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh = V_rt_tables[AF_INET6]; struct rtqk_arg arg; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = 0; arg.draining = 1; arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } #endif /* * Initialize our routing tree. * XXX MRT When off == 0, we are being called from vfs_export.c * so just set up their table and leave. (we know what the correct * value should be so just use that).. FIX AFTER RELENG_7 is MFC'd * see also comments in in_inithead() vfs_export.c and domain.h */ int in6_inithead(void **head, int off) { INIT_VNET_INET6(curvnet); struct radix_node_head *rnh; if (!rn_inithead(head, offsetof(struct sockaddr_in6, sin6_addr) << 3)) return 0; /* See above */ if (off == 0) /* See above */ return 1; /* only do the rest for the real thing */ + + V_rtq_reallyold6 = 60*60; /* one hour is ``really old'' */ + V_rtq_minreallyold6 = 10; /* never automatically crank down to less */ + V_rtq_toomany6 = 128; /* 128 cached routes is ``too many'' */ + V_rtq_timeout6 = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in6_addroute; rnh->rnh_matchaddr = in6_matroute; rnh->rnh_close = in6_clsroute; callout_init(&V_rtq_timer6, CALLOUT_MPSAFE); in6_rtqtimo(rnh); /* kick off timeout first time */ callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE); in6_mtutimo(rnh); /* kick off timeout first time */ return 1; } Index: head/sys/netinet6/in6_src.c =================================================================== --- head/sys/netinet6/in6_src.c (revision 185087) +++ head/sys/netinet6/in6_src.c (revision 185088) @@ -1,1125 +1,1131 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx addrsel_lock; #define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF) #define ADDRSEL_LOCK() mtx_lock(&addrsel_lock) #define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock) #define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED) static struct sx addrsel_sxlock; #define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock") #define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock) #define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock) #define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock) #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) + +#ifdef VIMAGE_GLOBALS struct in6_addrpolicy defaultaddrpolicy; +int ip6_prefer_tempaddr; +#endif -int ip6_prefer_tempaddr = 0; - static int selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct rtentry **, int, int)); static int in6_selectif __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *ro, struct ifnet **)); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy __P((int (*)(struct in6_addrpolicy *, void *), void *)); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ #define REPLACE(r) do {\ if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ V_ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto replace; \ } while(0) #define NEXT(r) do {\ if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ V_ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? 
ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ V_ip6stat.ip6s_sources_rule[(r)]++; \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) struct in6_addr * in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct route_in6 *ro, struct ucred *cred, struct ifnet **ifpp, int *errorp) { INIT_VNET_INET6(curvnet); struct in6_addr dst; struct ifnet *ifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; u_int32_t odstzone; int prefer_tempaddr; struct ip6_moptions *mopts; dst = dstsock->sin6_addr; /* make a copy for local operation */ *errorp = 0; if (ifpp) *ifpp = NULL; if (inp != NULL) { INP_LOCK_ASSERT(inp); mopts = inp->in6p_moptions; } else { mopts = NULL; } /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { struct sockaddr_in6 srcsock; struct in6_ifaddr *ia6; /* get the outgoing interface */ if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0) { return (NULL); } /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. */ bzero(&srcsock, sizeof(srcsock)); srcsock.sin6_family = AF_INET6; srcsock.sin6_len = sizeof(srcsock); srcsock.sin6_addr = pi->ipi6_addr; if (ifp) { *errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL); if (*errorp != 0) return (NULL); } ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock)); if (ia6 == NULL || (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { *errorp = EADDRNOTAVAIL; return (NULL); } pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; return (&ia6->ia_addr.sin6_addr); } /* * Otherwise, if the socket has already bound the source, just use it. */ if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { return (&inp->in6p_laddr); } /* * If the address is not specified, choose the best one based on * the outgoing interface and the destination address. */ /* get the outgoing interface */ if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0) return (NULL); #ifdef DIAGNOSTIC if (ifp == NULL) /* this should not happen */ panic("in6_selectsrc: NULL ifp"); #endif *errorp = in6_setscope(&dst, ifp, &odstzone); if (*errorp != 0) return (NULL); for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. 
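 * The numbered rules that follow implement default source address
 * selection in the spirit of RFC 3484; rules that this stack cannot
 * evaluate are skipped, and the longest-prefix-match rule is carried
 * as rule 14 for the reason its own comment gives.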
*/ if (in6_setscope(&dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(0); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(&dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. */ if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 * case above. */ /* Rule 5: Prefer outgoing interface */ if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. */ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = V_ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. */ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? 
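* A worked example of the rules above: for a global destination, a * global candidate beats a link-local one by rule 2 (appropriate * scope), and between two otherwise equal global candidates the one * sharing more leading prefix bits with dst wins by rule 14.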
*/ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst)); next: continue; out: break; } if ((ia = ia_best) == NULL) { *errorp = EADDRNOTAVAIL; return (NULL); } if (ifpp) *ifpp = ifp; return (&ia->ia_addr.sin6_addr); } /* * clone - meaningful only for bsdi and freebsd */ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone, int norouteok) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = NULL; struct rtentry *rt = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; #if 0 char ip6buf[INET6_ADDRSTRLEN]; if (dstsock->sin6_addr.s6_addr32[0] == 0 && dstsock->sin6_addr.s6_addr32[1] == 0 && !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { printf("in6_selectroute: strange destination %s\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr)); } else { printf("in6_selectroute: destination = %s%%%d\n", ip6_sprintf(ip6buf, &dstsock->sin6_addr), dstsock->sin6_scope_id); /* for debug */ } #endif /* If the caller specify the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for * multicast. */ goto done; } else goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { goto done; /* we do not need a route for multicast. */ } getroute: /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route_in6 *ron; sin6_next = satosin6(opts->ip6po_nexthop); /* at this moment, we only support AF_INET6 next hops */ if (sin6_next->sin6_family != AF_INET6) { error = EAFNOSUPPORT; /* or should we proceed? */ goto done; } /* * If the next hop is an IPv6 address, then the node identified * by that address must be a neighbor of the sending host. */ ron = &opts->ip6po_nextroute; if ((ron->ro_rt && (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != (RTF_UP | RTF_LLINFO)) || !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, &sin6_next->sin6_addr)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } *satosin6(&ron->ro_dst) = *sin6_next; } if (ron->ro_rt == NULL) { rtalloc((struct route *)ron); /* multi path case? */ if (ron->ro_rt == NULL || !(ron->ro_rt->rt_flags & RTF_LLINFO)) { if (ron->ro_rt) { RTFREE(ron->ro_rt); ron->ro_rt = NULL; } error = EHOSTUNREACH; goto done; } } rt = ron->ro_rt; ifp = rt->rt_ifp; /* * When cloning is required, try to allocate a route to the * destination so that the caller can store path MTU * information. */ if (!clone) goto done; } /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. 
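* E.g., an AF_INET destination cached by the IPv4 code would * otherwise be reused here, so the sa_family comparison below matters * as much as the RTF_UP and address checks.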
*/ if (ro) { if (ro->ro_rt && (!(ro->ro_rt->rt_flags & RTF_UP) || ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } if (ro->ro_rt == (struct rtentry *)NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); sa6 = (struct sockaddr_in6 *)&ro->ro_dst; *sa6 = *dstsock; sa6->sin6_scope_id = 0; if (clone) { #ifdef RADIX_MPATH rtalloc_mpath((struct route *)ro, ntohl(sa6->sin6_addr.s6_addr32[3])); #else rtalloc((struct route *)ro); #endif } else { ro->ro_rt = rtalloc1(&((struct route *)ro) ->ro_dst, 0, 0UL); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } } /* * do not care about the result if we have the nexthop * explicitly specified. */ if (opts && opts->ip6po_nexthop) goto done; if (ro->ro_rt) { ifp = ro->ro_rt->rt_ifp; if (ifp == NULL) { /* can this really happen? */ RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } if (ro->ro_rt == NULL) error = EHOSTUNREACH; rt = ro->ro_rt; /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (ifp && opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) { error = EHOSTUNREACH; goto done; } } } done: if (ifp == NULL && rt == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case an error. */ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) V_ip6stat.ip6s_noroute++; if (retifp != NULL) *retifp = ifp; if (retrt != NULL) *retrt = rt; /* rt may be NULL */ return (error); } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp) { int error; struct route_in6 sro; struct rtentry *rt = NULL; if (ro == NULL) { bzero(&sro, sizeof(sro)); ro = &sro; } if ((error = selectroute(dstsock, opts, mopts, ro, retifp, &rt, 0, 1)) != 0) { if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (error); } /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (flags); } /* * Adjust the "outgoing" interface. If we're going to loop the packet * back to ourselves, the ifp would be the loopback interface. * However, we'd rather know the interface associated to the * destination address (which should probably be one of our own * addresses.) 
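* E.g., when sending to one of our own addresses the route points at * the loopback interface, while rt_ifa identifies the interface that * actually carries the destination address.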
*/ if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) *retifp = rt->rt_ifa->ifa_ifp; if (ro == &sro && rt && rt == sro.ro_rt) RTFREE(rt); return (0); } /* * clone - meaningful only for bsdi and freebsd */ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct rtentry **retrt, int clone) { return (selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone, 0)); } /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6_selecthlim(struct in6pcb *in6p, struct ifnet *ifp) { INIT_VNET_INET6(curvnet); if (in6p && in6p->in6p_hops >= 0) return (in6p->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { struct route_in6 ro6; struct ifnet *lifp; bzero(&ro6, sizeof(ro6)); ro6.ro_dst.sin6_family = AF_INET6; ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); ro6.ro_dst.sin6_addr = in6p->in6p_faddr; rtalloc((struct route *)&ro6); if (ro6.ro_rt) { lifp = ro6.ro_rt->rt_ifp; RTFREE(ro6.ro_rt); if (lifp) return (ND_IFINFO(lifp)->chlim); } else return (V_ip6_defhlim); } return (V_ip6_defhlim); } /* * XXX: this is borrowed from in6_pcbbind(). If possible, we should * share this function by all *bsd*... */ int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { INIT_VNET_INET(curvnet); struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0, dorandom; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) wild = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP, use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count. */ if (pcbinfo != &V_udbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { u_int16_t aux; aux = first; first = last; last = aux; } if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) { /* completely used? */ /* Undo an address bind that may have occurred. 
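* (in6_pcbbind() may have set in6p_laddr before calling this * function, so reset it to the unspecified address on failure.)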
*/ inp->in6p_laddr = in6addr_any; return (EADDRNOTAVAIL); } ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, wild, cred)); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } return (0); } void addrsel_policy_init(void) { ADDRSEL_LOCK_INIT(); ADDRSEL_SXLOCK_INIT(); INIT_VNET_INET6(curvnet); + V_ip6_prefer_tempaddr = 0; + init_policy_queue(); /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { INIT_VNET_INET6(curvnet); struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) match = &V_defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); return (match); } /* * Subroutines to manage the address selection policy table via sysctl. */ struct walkarg { struct sysctl_req *w_req; }; static int in6_src_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, CTLFLAG_RD, in6_src_sysctl, ""); static int in6_src_sysctl(SYSCTL_HANDLER_ARGS) { struct walkarg w; if (req->newptr) return EPERM; bzero(&w, sizeof(w)); w.w_req = req; return (walk_addrsel_policy(dump_addrsel_policyent, &w)); } int in6_src_ioctl(u_long cmd, caddr_t data) { int i; struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is contiguous. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbage (if any) from the prefix address. */ for (i = 0; i < 4; i++) { ent0.addr.sin6_addr.s6_addr32[i] &= ent0.addrmask.sin6_addr.s6_addr32[i]; } ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: appease compilers */ } /* * The following is an implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX an implementation using a binary tree would be more efficient. */ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); +#ifdef VIMAGE_GLOBALS struct addrsel_policyhead addrsel_policytab; +#endif static void init_policy_queue(void) { INIT_VNET_INET6(curvnet); TAILQ_INIT(&V_addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *new, *pol; new = malloc(sizeof(*new), M_IFADDR, M_WAITOK); ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* duplication check */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(new, M_IFADDR); return (EEXIST); /* or override it? 
*/ } } bzero(new, sizeof(*new)); /* XXX: should validate entry */ new->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (ESRCH); } TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); } } ADDRSEL_SUNLOCK(); return (error); } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct walkarg *w = arg; error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); return (error); } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { INIT_VNET_INET6(curvnet); struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); } Index: head/sys/netinet6/ip6_forward.c =================================================================== --- head/sys/netinet6/ip6_forward.c (revision 185087) +++ head/sys/netinet6/ip6_forward.c (revision 185088) @@ -1,659 +1,661 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_forward.c,v 1.69 2001/05/17 03:48:30 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include +#ifdef VIMAGE_GLOBALS struct route_in6 ip6_forward_rt; +#endif /* * Forward a packet. If some error occurs, return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. */ void ip6_forward(struct mbuf *m, int srcrt) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst = NULL; struct rtentry *rt = NULL; int error, type = 0, code = 0; struct mbuf *mcopy = NULL; struct ifnet *origifp; /* maybe unnecessary */ u_int32_t inzone, outzone; struct in6_addr src_in6, dst_in6; #ifdef IPSEC INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = NULL; int ipsecrt = 0; #endif char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* GIANT_REQUIRED; */ /* XXX bz: ip6_forward_rt */ #ifdef IPSEC /* * Check AH/ESP integrity. */ /* * Don't increment ip6s_cantforward because this check happens * before the packet is actually forwarded. */ if (ipsec6_in_reject(m, NULL)) { V_ipsec6stat.in_polvio++; m_freem(m); return; } #endif /* IPSEC */ /* * Do not forward packets to a multicast destination (they should be * handled by ip6_mforward()). * Do not forward packets with an unspecified source. It was discussed * in July 2000, on the ipngwg mailing list. */ if ((m->m_flags & (M_BCAST|M_MCAST)) != 0 || IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { V_ip6stat.ip6s_cantforward++; /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ if (V_ip6_log_time + V_ip6_log_interval < time_second) { V_ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } m_freem(m); return; } #ifdef IPSTEALTH if (!V_ip6stealth) { #endif if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); return; } ip6->ip6_hlim -= IPV6_HLIMDEC; #ifdef IPSTEALTH } #endif /* * Save at most ICMPV6_PLD_MAXLEN (= the min IPv6 MTU - * size of IPv6 + ICMPv6 headers) bytes of the packet in case * we need to generate an ICMP6 message to the src. * Thanks to M_EXT, in most cases copy will not occur. 
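* (ICMPV6_PLD_MAXLEN works out to IPV6_MMTU (1280) minus the 40-byte * IPv6 header and the 8-byte ICMPv6 header, i.e. 1232 bytes.)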
* * It is important to save it before IPsec processing as IPsec * processing may modify the mbuf. */ mcopy = m_copy(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN)); #ifdef IPSEC /* get a security policy for this packet */ sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &error); if (sp == NULL) { V_ipsec6stat.out_inval++; V_ip6stat.ip6s_cantforward++; if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ V_ipsec6stat.out_polvio++; V_ip6stat.ip6s_cantforward++; KEY_FREESP(&sp); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ KEY_FREESP(&sp); goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* XXX should be panic ? */ printf("ip6_forward: No IPsec request specified.\n"); V_ip6stat.ip6s_cantforward++; KEY_FREESP(&sp); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } /* do IPsec */ break; case IPSEC_POLICY_ENTRUST: default: /* should be panic ?? */ printf("ip6_forward: Invalid policy found. %d\n", sp->policy); KEY_FREESP(&sp); goto skip_ipsec; } { struct ipsecrequest *isr = NULL; struct ipsec_output_state state; /* * when the kernel forwards a packet, applying IPsec transport * mode to it is not proper. this check avoids that. * at present, if there is even a transport mode SA request in the * security policy, the kernel does not apply IPsec to the packet. * this check is not enough because the following case is valid. * ipsec esp/tunnel/xxx-xxx/require esp/transport//require; */ for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_ANY) goto doipsectunnel; if (isr->saidx.mode == IPSEC_MODE_TUNNEL) goto doipsectunnel; } /* * if there's no need for tunnel mode IPsec, skip. */ if (!isr) goto skip_ipsec; doipsectunnel: /* * All the extension headers will become inaccessible * (since they can be encrypted). * Don't panic, we need no more updates to extension headers * on inner IPv6 packet (since they are now encapsulated). * * IPv6 [ESP|AH] IPv6 [extension headers] payload */ bzero(&state, sizeof(state)); state.m = m; state.ro = NULL; /* update at ipsec6_output_tunnel() */ state.dst = NULL; /* update at ipsec6_output_tunnel() */ error = ipsec6_output_tunnel(&state, sp, 0); m = state.m; KEY_FREESP(&sp); if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* FALLTHROUGH */ case ENOENT: /* don't show these error codes to the user */ break; } V_ip6stat.ip6s_cantforward++; if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } else { /* * In the FAST IPSec case we have already * re-injected the packet and it has been freed * by the ipsec_done() function. So, just clean * up after ourselves. */ m = NULL; goto freecopy; } if ((m != NULL) && (ip6 != mtod(m, struct ip6_hdr *)) ){ /* * now tunnel mode headers are added. we are originating the * packet instead of forwarding it. */ ip6_output(m, NULL, NULL, IPV6_FORWARDING/*XXX*/, NULL, NULL, NULL); goto freecopy; } /* adjust pointer */ dst = (struct sockaddr_in6 *)state.dst; rt = state.ro ? 
state.ro->ro_rt : NULL; if (dst != NULL && rt != NULL) ipsecrt = 1; } skip_ipsec: #endif /* IPSEC */ #ifdef IPSEC if (ipsecrt) goto skip_routing; #endif dst = (struct sockaddr_in6 *)&V_ip6_forward_rt.ro_dst; if (!srcrt) { /* ip6_forward_rt.ro_dst.sin6_addr is equal to ip6->ip6_dst */ if (V_ip6_forward_rt.ro_rt == 0 || (V_ip6_forward_rt.ro_rt->rt_flags & RTF_UP) == 0) { if (V_ip6_forward_rt.ro_rt) { RTFREE(V_ip6_forward_rt.ro_rt); V_ip6_forward_rt.ro_rt = 0; } /* this probably fails but give it a try again */ rtalloc((struct route *)&V_ip6_forward_rt); } if (V_ip6_forward_rt.ro_rt == 0) { V_ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); } m_freem(m); return; } } else if ((rt = V_ip6_forward_rt.ro_rt) == 0 || !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &dst->sin6_addr)) { if (V_ip6_forward_rt.ro_rt) { RTFREE(V_ip6_forward_rt.ro_rt); V_ip6_forward_rt.ro_rt = 0; } bzero(dst, sizeof(*dst)); dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_family = AF_INET6; dst->sin6_addr = ip6->ip6_dst; rtalloc((struct route *)&V_ip6_forward_rt); if (V_ip6_forward_rt.ro_rt == 0) { V_ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); } m_freem(m); return; } } rt = V_ip6_forward_rt.ro_rt; #ifdef IPSEC skip_routing:; #endif /* * Source scope check: if a packet can't be delivered to its * destination for the reason that the destination is beyond the scope * of the source address, discard the packet and return an icmp6 * destination unreachable error with Code 2 (beyond scope of source * address). We use a local copy of ip6_src, since in6_setscope() * will possibly modify its first argument. * [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ src_in6 = ip6->ip6_src; if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { /* XXX: this should not happen */ V_ip6stat.ip6s_cantforward++; V_ip6stat.ip6s_badscope++; m_freem(m); return; } if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { V_ip6stat.ip6s_cantforward++; V_ip6stat.ip6s_badscope++; m_freem(m); return; } if (inzone != outzone #ifdef IPSEC && !ipsecrt #endif ) { V_ip6stat.ip6s_cantforward++; V_ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); if (V_ip6_log_time + V_ip6_log_interval < time_second) { V_ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); m_freem(m); return; } /* * Destination scope check: if a packet is going to break the scope * zone of packet's destination address, discard it. This case should * usually be prevented by appropriately-configured routing table, but * we need an explicit check because we may mistakenly forward the * packet to a different zone by (e.g.) a default route. 
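* E.g., a packet to a link-local destination received on one * interface must not follow a default route out another interface; * the inzone/outzone comparison below catches exactly that case.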
*/ dst_in6 = ip6->ip6_dst; if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || inzone != outzone) { V_ip6stat.ip6s_cantforward++; V_ip6stat.ip6s_badscope++; m_freem(m); return; } if (m->m_pkthdr.len > IN6_LINKMTU(rt->rt_ifp)) { in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); if (mcopy) { u_long mtu; #ifdef IPSEC struct secpolicy *sp; int ipsecerror; size_t ipsechdrsiz; #endif /* IPSEC */ mtu = IN6_LINKMTU(rt->rt_ifp); #ifdef IPSEC /* * When we do IPsec tunnel ingress, we need to play * with the link value (decrement IPsec header size * from mtu value). The code is much simpler than v4 * case, as we have the outgoing interface for * encapsulated packet as "rt->rt_ifp". */ sp = ipsec_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp) { ipsechdrsiz = ipsec6_hdrsiz(mcopy, IPSEC_DIR_OUTBOUND, NULL); if (ipsechdrsiz < mtu) mtu -= ipsechdrsiz; } /* * if mtu becomes less than minimum MTU, * tell minimum MTU (and I'll need to fragment it). */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU; #endif /* IPSEC */ icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu); } m_freem(m); return; } if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in6 *)rt->rt_gateway; /* * If we are to forward the packet using the same interface * as the one we got the packet from, perhaps we should send a redirect * to the sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a route * modified by a redirect. */ if (V_ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && #ifdef IPSEC !ipsecrt && #endif /* IPSEC */ (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* * If the incoming interface is equal to the outgoing * one, and the link attached to the interface is * point-to-point, then it will be highly probable * that a routing loop occurs. Thus, we immediately * drop the packet and send an ICMPv6 error message. * * type/code is based on suggestion by Rich Draves. * not sure if it is the best pick. */ icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); m_freem(m); return; } type = ND_REDIRECT; } /* * Fake scoped addresses. Note that even link-local source or * destination can appear, if the originating node just sends the * packet to us (without address resolution for the destination). * Since both icmp6_error and icmp6_redirect_output fill the embedded * link identifiers, we can do this stuff after making a copy for * returning an error. */ if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { /* * See corresponding comments in ip6_output. * XXX: but is it possible that ip6_forward() sends a packet * to a loopback interface? I don't think so, and thus * I bark here. (jinmei@kame.net) * XXX: it is common to route invalid packets to loopback. * also, the codepath will be visited on use of ::1 in * rthdr. (itojun) */ #if 1 if (0) #else if ((rt->rt_flags & (RTF_BLACKHOLE|RTF_REJECT)) == 0) #endif { printf("ip6_forward: outgoing interface is loopback. " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } /* we can just use rcvif in forwarding. */ origifp = m->m_pkthdr.rcvif; } else origifp = rt->rt_ifp; /* * clear embedded scope identifiers if necessary. 
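* (KAME-derived stacks keep the zone id embedded in the second 16-bit * word of a link-local address while inside the kernel, e.g. * fe80:4::1 for zone 4; that internal form must never appear on the * wire.)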
* in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet6_pfil_hook)) goto pass; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&inet6_pfil_hook, &m, rt->rt_ifp, PFIL_OUT, NULL); if (error != 0) goto senderr; if (m == NULL) goto freecopy; ip6 = mtod(m, struct ip6_hdr *); pass: error = nd6_output(rt->rt_ifp, origifp, m, dst, rt); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); V_ip6stat.ip6s_cantforward++; } else { V_ip6stat.ip6s_forward++; in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward); if (type) V_ip6stat.ip6s_redirectsent++; else { if (mcopy) goto freecopy; } } senderr: if (mcopy == NULL) return; switch (error) { case 0: if (type == ND_REDIRECT) { icmp6_redirect_output(mcopy, rt); return; } goto freecopy; case EMSGSIZE: /* xxx MTU is constant in PPP? */ goto freecopy; case ENOBUFS: /* Tell source to slow down like source quench in IP? */ goto freecopy; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP6_DST_UNREACH; code = ICMP6_DST_UNREACH_ADDR; break; } icmp6_error(mcopy, type, code, 0); return; freecopy: m_freem(mcopy); return; } Index: head/sys/netinet6/ip6_input.c =================================================================== --- head/sys/netinet6/ip6_input.c (revision 185087) +++ head/sys/netinet6/ip6_input.c (revision 185088) @@ -1,1593 +1,1672 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; static struct ifqueue ip6intrq; -static int ip6qmaxlen = IFQ_MAXLEN; + +#ifdef VIMAGE_GLOBALS +static int ip6qmaxlen; struct in6_ifaddr *in6_ifaddr; +struct ip6stat ip6stat; +#endif extern struct callout in6_tmpaddrtimer_ch; +extern int dad_init; +extern int pmtu_expire; +extern int pmtu_probe; +extern u_long rip6_sendspace; +extern u_long rip6_recvspace; +extern int icmp6errppslim; +extern int icmp6_nodeinfo; +extern int udp6_sendspace; +extern int udp6_recvspace; + +#ifdef VIMAGE_GLOBALS int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ int ip6_sourcecheck_interval; /* XXX */ - int ip6_ours_check_algorithm; +#endif struct pfil_head inet6_pfil_hook; -struct ip6stat ip6stat; - static void ip6_init2(void *); static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { INIT_VNET_INET6(curvnet); struct ip6protosw *pr; int i; + + V_ip6qmaxlen = IFQ_MAXLEN; + V_in6_maxmtu = 0; +#ifdef IP6_AUTO_LINKLOCAL + V_ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; +#else + V_ip6_auto_linklocal = 1; /* enable by default */ +#endif + +#ifndef IPV6FORWARDING +#ifdef GATEWAY6 +#define IPV6FORWARDING 1 /* forward IP6 packets not for us */ +#else +#define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ +#endif /* GATEWAY6 */ +#endif /* !IPV6FORWARDING */ + +#ifndef IPV6_SENDREDIRECTS +#define IPV6_SENDREDIRECTS 1 +#endif + + V_ip6_forwarding = IPV6FORWARDING; /* act as router? 
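+ * (runtime-tunable via the net.inet6.ip6.forwarding sysctl)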
*/ + V_ip6_sendredirects = IPV6_SENDREDIRECTS; + V_ip6_defhlim = IPV6_DEFHLIM; + V_ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; + V_ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ + V_ip6_log_interval = 5; + V_ip6_hdrnestlimit = 15; /* How many header options will we process? */ + V_ip6_dad_count = 1; /* DupAddrDetectionTransmits */ + V_ip6_auto_flowlabel = 1; + V_ip6_use_deprecated = 1;/* allow deprecated addr (RFC2462 5.5.4) */ + V_ip6_rr_prune = 5; /* router renumbering prefix + * walk list every 5 sec. */ + V_ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? */ + V_ip6_v6only = 1; + V_ip6_keepfaith = 0; + V_ip6_log_time = (time_t)0L; +#ifdef IPSTEALTH + V_ip6stealth = 0; +#endif + V_nd6_onlink_ns_rfc4861 = 0; /* allow 'on-link' nd6 NS (RFC 4861) */ + + V_pmtu_expire = 60*10; + V_pmtu_probe = 60*2; + + /* raw IP6 parameters */ + /* + * Nominal space allocated to a raw ip socket. + */ +#define RIPV6SNDQ 8192 +#define RIPV6RCVQ 8192 + V_rip6_sendspace = RIPV6SNDQ; + V_rip6_recvspace = RIPV6RCVQ; + + /* ICMPV6 parameters */ + V_icmp6_rediraccept = 1; /* accept and process redirects */ + V_icmp6_redirtimeout = 10 * 60; /* 10 minutes */ + V_icmp6errppslim = 100; /* 100pps */ + /* control how to respond to NI queries */ + V_icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); + + /* UDP on IP6 parameters */ + V_udp6_sendspace = 9216; /* really max datagram size */ + V_udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); + /* 40 1K datagrams */ + V_dad_init = 0; #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); #endif pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip6_init"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = (struct ip6protosw *)inet6domain.dom_protosw; pr < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } /* Initialize packet filter hooks. 
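* Packet filters such as pf and ipfw attach to this head later via * pfil_add_hook() to see inbound and outbound IPv6 traffic.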
*/ inet6_pfil_hook.ph_type = PFIL_TYPE_AF; inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); ip6intrq.ifq_maxlen = V_ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; } static void ip6_init2(void *dummy) { INIT_VNET_INET6(curvnet); /* nd6_timer_init */ callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, NULL); /* timer for regeneration of the temporary address randomize ID */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); extern struct route_in6 ip6_forward_rt; void ip6_input(struct mbuf *m) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; u_int32_t plen; u_int32_t rtalert = ~0; int nxt, ours = 0; struct ifnet *deliverifp = NULL; struct in6_addr odst; int srcrt = 0; #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ /* * make sure we don't have onion-peeling information in m_tag. */ ip6_delaux(m); /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) V_ip6stat.ip6s_mext2m++; else V_ip6stat.ip6s_mext1++; } else { #define M2MMAX (sizeof(V_ip6stat.ip6s_m2m)/sizeof(V_ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { V_ip6stat.ip6s_m2m[V_loif[0].if_index]++; /* XXX */ } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) V_ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else V_ip6stat.ip6s_m2m[0]++; } else V_ip6stat.ip6s_m1++; #undef M2MMAX } /* drop the packet if IPv6 operation is disabled on the IF */ if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return; } in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); V_ip6stat.ip6s_total++; #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return an mbuf chain * that does not conform to the KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? 
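* The workaround below copies such a chain into a single mbuf (or * cluster) so that the extension header checks can assume contiguous * data.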
*/ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n) M_MOVE_PKTHDR(n, m); if (n && n->m_pkthdr.len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (n == NULL) { m_freem(m); return; /* ENOBUFS */ } m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { V_ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { V_ip6stat.ip6s_badvers++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } V_ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ odst = ip6->ip6_dst; /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet6_pfil_hook)) goto passin; if (pfil_run_hooks(&inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); passin: /* * Disambiguate address scope zones (if there is ambiguity). 
* We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { V_ip6stat.ip6s_badscope++; /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { V_ip6stat.ip6s_badscope++; goto bad; } /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *in6m = 0; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); /* * See if we belong to the destination multicast group on the * arrival interface. */ IN6_LOOKUP_MULTI(ip6->ip6_dst, m->m_pkthdr.rcvif, in6m); if (in6m) ours = 1; else if (!ip6_mrouter) { V_ip6stat.ip6s_notmember++; V_ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } deliverifp = m->m_pkthdr.rcvif; goto hbhcheck; } /* * Unicast check */ if (V_ip6_forward_rt.ro_rt != NULL && (V_ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &((struct sockaddr_in6 *)(&V_ip6_forward_rt.ro_dst))->sin6_addr)) V_ip6stat.ip6s_forward_cachehit++; else { struct sockaddr_in6 *dst6; if (V_ip6_forward_rt.ro_rt) { /* route is down or destination is different */ V_ip6stat.ip6s_forward_cachemiss++; RTFREE(V_ip6_forward_rt.ro_rt); V_ip6_forward_rt.ro_rt = 0; } bzero(&V_ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); dst6 = (struct sockaddr_in6 *)&V_ip6_forward_rt.ro_dst; dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; rtalloc((struct route *)&V_ip6_forward_rt); } #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) /* * Accept the packet if the forwarding interface to the destination * according to the routing table is the loopback interface, * unless the associated route has a gateway. * Note that this approach causes us to accept a packet if there is a * route to the loopback interface for the destination of the packet. * But we think it's even useful in some situations, e.g. when using * a special daemon which wants to intercept the packet. * * XXX: some OSes automatically make a cloned route for the destination * of an outgoing packet. If the outgoing interface of the packet * is a loopback one, the kernel would consider the packet to be * accepted, even if we have no such address assigned on the interface. * We check the cloned flag of the route entry to reject such cases, * assuming that route entries for our own addresses are not made by * cloning (it should be true because in6_addloop explicitly installs * the host route). However, we might have to do an explicit check, * even though it would be less efficient. Or, should we rather install * a reject route for such a case? */ if (V_ip6_forward_rt.ro_rt && (V_ip6_forward_rt.ro_rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST && #ifdef RTF_WASCLONED !(V_ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) && #endif #ifdef RTF_CLONED !(V_ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) && #endif #if 0 /* * The check below is redundant since the comparison of * the destination and the key of the rtentry has * already been done by looking up the routing table. 
*/ IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt6_key(V_ip6_forward_rt.ro_rt)->sin6_addr) #endif V_ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { struct in6_ifaddr *ia6 = (struct in6_ifaddr *)V_ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_tag. */ (void)ip6_setdstifaddr(m, ia6); /* * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. */ if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ /* Count the packet in the ip address stats */ ia6->ia_ifa.if_ipackets++; ia6->ia_ifa.if_ibytes += m->m_pkthdr.len; goto hbhcheck; } else { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); goto bad; } } /* * FAITH (Firewall Aided Internet Translator) */ if (V_ip6_keepfaith) { if (V_ip6_forward_rt.ro_rt && V_ip6_forward_rt.ro_rt->rt_ifp && V_ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? */ ours = 1; deliverifp = V_ip6_forward_rt.ro_rt->rt_ifp; /* faith */ goto hbhcheck; } } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { V_ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * record address information into m_tag, if we don't have one yet. * note that we are unable to record it, if the address is not listed * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). */ if (deliverifp && !ip6_getdstifaddr(m)) { struct in6_ifaddr *ia6; ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) { if (!ip6_setdstifaddr(m, ia6)) { /* * XXX maybe we should drop the packet here, * as we could not provide enough information * to the upper layers. */ } } } /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif return; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ V_ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); return; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { V_ip6stat.ip6s_tooshort++; return; } #endif nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. 
* Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (rtalert != ~0 && V_ip6_forwarding) { switch (rtalert) { case IP6OPT_RTALERT_MLD: ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } } else nxt = ip6->ip6_nxt; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { V_ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip6_mrouter && ip6_mforward && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { V_ip6stat.ip6s_cantforward++; m_freem(m); return; } if (!ours) { m_freem(m); return; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ V_ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { V_ip6stat.ip6s_toomanyhdr++; goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { V_ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip6_ipsec_input(m, nxt)) goto bad; #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * set/grab in6_ifaddr correspond to IPv6 destination address. * XXX backward compatibility wrapper */ static struct ip6aux * ip6_setdstifaddr(struct mbuf *m, struct in6_ifaddr *ia6) { struct ip6aux *ip6a; ip6a = ip6_addaux(m); if (ip6a) ip6a->ip6a_dstia6 = ia6; return ip6a; /* NULL if failed to set */ } struct in6_ifaddr * ip6_getdstifaddr(struct mbuf *m) { struct ip6aux *ip6a; ip6a = ip6_findaux(m); if (ip6a) return ip6a->ip6a_dstia6; else return NULL; } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. 
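 * (Worked example, added in editing: ip6h_len counts 8-octet units
 * beyond the first 8 octets, so the code sizes the header as
 * hbhlen = (ip6h_len + 1) << 3; a header with ip6h_len == 1 is
 * 16 bytes, 2 bytes of fixed ip6_hbh plus 14 bytes of options.)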
* * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; u_int8_t *opt; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { V_ip6stat.ip6s_tooshort++; return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { V_ip6stat.ip6s_tooshort++; return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in continuous memory region. */ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { V_ip6stat.ip6s_toosmall++; goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { V_ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { V_ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. 
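		 * (Worked example, added in editing: the jumbo option is the
		 * TLV { IP6OPT_JUMBO, 4, 32-bit length }, e.g. the bytes
		 * c2 04 00 01 86 a0 for a 100000-byte payload; ip6_plen must
		 * be 0, and the bcopy/byte-order conversion above recovers
		 * jumboplen = 100000, which passes the IPV6_MAXPACKET check
		 * below and is accepted.)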
		 * the behavior may need some debate in ipngwg -
		 * multiple options do not make sense, however,
		 * there's no explicit mention in the specification.
		 */
			if (*plenp != 0) {
				V_ip6stat.ip6s_badoptions++;
				icmp6_error(m, ICMP6_PARAM_PROB,
				    ICMP6_PARAMPROB_HEADER,
				    erroff + opt + 2 - opthead);
				return (-1);
			}
#endif

			/*
			 * jumbo payload length must be larger than 65535.
			 */
			if (jumboplen <= IPV6_MAXPACKET) {
				V_ip6stat.ip6s_badoptions++;
				icmp6_error(m, ICMP6_PARAM_PROB,
				    ICMP6_PARAMPROB_HEADER,
				    erroff + opt + 2 - opthead);
				return (-1);
			}
			*plenp = jumboplen;

			break;
		default:		/* unknown option */
			if (hbhlen < IP6OPT_MINLEN) {
				V_ip6stat.ip6s_toosmall++;
				goto bad;
			}
			optlen = ip6_unknown_opt(opt, m,
			    erroff + opt - opthead);
			if (optlen == -1)
				return (-1);
			optlen += 2;
			break;
		}
	}

	return (0);

  bad:
	m_freem(m);
	return (-1);
}

/*
 * Unknown option processing.
 * The third argument `off' is the offset from the IPv6 header to the option,
 * which is necessary to return an ICMPv6 error even when the IPv6 header
 * and the option header are not contiguous.
 */
int
ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
{
	INIT_VNET_INET6(curvnet);
	struct ip6_hdr *ip6;

	switch (IP6OPT_TYPE(*optp)) {
	case IP6OPT_TYPE_SKIP: /* ignore the option */
		return ((int)*(optp + 1));
	case IP6OPT_TYPE_DISCARD:	/* silently discard */
		m_freem(m);
		return (-1);
	case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
		V_ip6stat.ip6s_badoptions++;
		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
		return (-1);
	case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
		V_ip6stat.ip6s_badoptions++;
		ip6 = mtod(m, struct ip6_hdr *);
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
		    (m->m_flags & (M_BCAST|M_MCAST)))
			m_freem(m);
		else
			icmp6_error(m, ICMP6_PARAM_PROB,
			    ICMP6_PARAMPROB_OPTION, off);
		return (-1);
	}

	m_freem(m);		/* XXX: NOTREACHED */
	return (-1);
}

/*
 * Create the "control" list for this pcb.
 * These functions will not modify the mbuf chain at all.
 *
 * With KAME mbuf chain restriction:
 * The routine will be called from upper layer handlers like tcp6_input().
 * Thus the routine assumes that the caller (tcp6_input) has already
 * called IP6_EXTHDR_CHECK() and all the extension headers are located in the
 * very first mbuf on the mbuf chain.
 *
 * ip6_savecontrol_v4 will handle those options that can be set on a
 * v4-mapped socket.
 * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
 * options and handle the v6-only ones itself.
 */
struct mbuf **
ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
    int *v4only)
{
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

#ifdef SO_TIMESTAMP
	if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
		struct timeval tv;

		microtime(&tv);
		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
		    SCM_TIMESTAMP, SOL_SOCKET);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#endif

	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
		if (v4only != NULL)
			*v4only = 1;
		return (mp);
	}

#define IS2292(inp, x, y)	(((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
	/* RFC 2292 sec. 5 */
	if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
		struct in6_pktinfo pi6;

		bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
		in6_clearscope(&pi6.ipi6_addr);	/* XXX */
		pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ?
m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim = ip6->ip6_hlim & 0xff; *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) *v4only = 0; return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; if ((in6p->in6p_flags & IN6P_TCLASS) != 0) { u_int32_t flowinfo; int tclass; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(tclass), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { V_ip6stat.ip6s_tooshort++; return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); V_ip6stat.ip6s_tooshort++; return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->in6p_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? 
					  */
				break;
			default:
				goto loopend;
			}

#ifndef PULLDOWN_TEST
			if (off + sizeof(*ip6e) > m->m_len)
				goto loopend;
			ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
			if (nxt == IPPROTO_AH)
				elen = (ip6e->ip6e_len + 2) << 2;
			else
				elen = (ip6e->ip6e_len + 1) << 3;
			if (off + elen > m->m_len)
				goto loopend;
#else
			ext = ip6_pullexthdr(m, off, nxt);
			if (ext == NULL) {
				V_ip6stat.ip6s_tooshort++;
				return;
			}
			ip6e = mtod(ext, struct ip6_ext *);
			if (nxt == IPPROTO_AH)
				elen = (ip6e->ip6e_len + 2) << 2;
			else
				elen = (ip6e->ip6e_len + 1) << 3;
			if (elen != ext->m_len) {
				m_freem(ext);
				V_ip6stat.ip6s_tooshort++;
				return;
			}
#endif

			switch (nxt) {
			case IPPROTO_DSTOPTS:
				if (!(in6p->in6p_flags & IN6P_DSTOPTS))
					break;
				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
				    IS2292(in6p,
					IPV6_2292DSTOPTS, IPV6_DSTOPTS),
				    IPPROTO_IPV6);
				if (*mp)
					mp = &(*mp)->m_next;
				break;
			case IPPROTO_ROUTING:
				if (!(in6p->in6p_flags & IN6P_RTHDR))
					break;
				*mp = sbcreatecontrol((caddr_t)ip6e, elen,
				    IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
				    IPPROTO_IPV6);
				if (*mp)
					mp = &(*mp)->m_next;
				break;
			case IPPROTO_HOPOPTS:
			case IPPROTO_AH: /* is it possible? */
				break;

			default:
				/*
				 * Other cases have been filtered out above;
				 * none should reach this point.  We supply
				 * the code just in case (e.g., nxt was
				 * overwritten).
				 */
#ifdef PULLDOWN_TEST
				m_freem(ext);
#endif
				goto loopend;
			}

			/* proceed with the next header. */
			off += elen;
			nxt = ip6e->ip6e_nxt;
			ip6e = NULL;
#ifdef PULLDOWN_TEST
			m_freem(ext);
			ext = NULL;
#endif
		}
	  loopend:
		;
	}
}
#undef IS2292

void
ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu)
{
	struct socket *so;
	struct mbuf *m_mtu;
	struct ip6_mtuinfo mtuctl;

	so = in6p->inp_socket;

	if (mtu == NULL)
		return;

#ifdef DIAGNOSTIC
	if (so == NULL)		/* I believe this is impossible */
		panic("ip6_notify_pmtu: socket is NULL");
#endif

	bzero(&mtuctl, sizeof(mtuctl));	/* zero-clear for safety */
	mtuctl.ip6m_mtu = *mtu;
	mtuctl.ip6m_addr = *dst;
	if (sa6_recoverscope(&mtuctl.ip6m_addr))
		return;

	if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl),
	    IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
		return;

	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu)
	    == 0) {
		m_freem(m_mtu);
		/* XXX: should count statistics */
	} else
		sorwakeup(so);

	return;
}

#ifdef PULLDOWN_TEST
/*
 * pull single extension header from mbuf chain.  returns single mbuf that
 * contains the result, or NULL on error.
 */
static struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
	struct ip6_ext ip6e;
	size_t elen;
	struct mbuf *n;

#ifdef DIAGNOSTIC
	switch (nxt) {
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_HOPOPTS:
	case IPPROTO_AH: /* is it possible? */
		break;
	default:
		printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
	}
#endif

	m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
	if (nxt == IPPROTO_AH)
		elen = (ip6e.ip6e_len + 2) << 2;
	else
		elen = (ip6e.ip6e_len + 1) << 3;

	MGET(n, M_DONTWAIT, MT_DATA);
	if (n && elen >= MLEN) {
		MCLGET(n, M_DONTWAIT);
		if ((n->m_flags & M_EXT) == 0) {
			m_free(n);
			n = NULL;
		}
	}
	if (!n)
		return NULL;

	n->m_len = 0;
	if (elen >= M_TRAILINGSPACE(n)) {
		m_free(n);
		return NULL;
	}

	m_copydata(m, off, elen, mtod(n, caddr_t));
	n->m_len = elen;

	return n;
}
#endif

/*
 * Get pointer to the previous header followed by the header
 * currently processed.
 * XXX: This function supposes that
 *	M includes all headers,
 *	the next header field and the header length field of each header
 *	are valid, and
 *	the sum of each header length equals OFF.
 * Because of these assumptions, this function must be called very
 * carefully.
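 * (Worked example, added in editing: an AH header with ip6e_len == 4
 * spans (4 + 2) << 2 = 24 bytes, since AH counts 4-octet units minus
 * two, while any other extension header with ip6e_len == 1 spans
 * (1 + 1) << 3 = 16 bytes, counting 8-octet units.)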
Moreover, it will not be used in the near future when * we develop `neater' mechanism to process extension headers. */ char * ip6_get_prevhdr(struct mbuf *m, int off) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return (&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return (&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } return -1; } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } struct ip6aux * ip6_addaux(struct mbuf *m) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); if (!mtag) { mtag = m_tag_get(PACKET_TAG_IPV6_INPUT, sizeof(struct ip6aux), M_NOWAIT); if (mtag) { m_tag_prepend(m, mtag); bzero(mtag + 1, sizeof(struct ip6aux)); } } return mtag ? (struct ip6aux *)(mtag + 1) : NULL; } struct ip6aux * ip6_findaux(struct mbuf *m) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); return mtag ? 
(struct ip6aux *)(mtag + 1) : NULL; } void ip6_delaux(struct mbuf *m) { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); if (mtag) m_tag_delete(m, mtag); } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; Index: head/sys/netinet6/ip6_mroute.c =================================================================== --- head/sys/netinet6/ip6_mroute.c (revision 185087) +++ head/sys/netinet6/ip6_mroute.c (revision 185088) @@ -1,1930 +1,1949 @@ /*- * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 * BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry"); /* XXX: this is a very common idiom; move to ? */ #define M_HASCL(m) ((m)->m_flags & M_EXT) static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *); static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); +static void pim6_init(void); static int set_pim6(int *); static int socket_send __P((struct socket *, struct mbuf *, struct sockaddr_in6 *)); static int register_send __P((struct ip6_hdr *, struct mif6 *, struct mbuf *)); extern struct domain inet6domain; /* XXX: referenced from ip_mroute.c for dynamically loading this code. 
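 * (Editorial context for the VIMAGE_GLOBALS hunks in this revision: with
 * virtualized network stacks each V_foo macro resolves to a per-vnet
 * instance of the variable, so static initializers such as "= 0" no
 * longer apply; the new pr_init hook pim6_init() now performs that
 * initialization when the protocol is attached.)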
*/ struct ip6protosw in6_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, + .pr_init = pim6_init, .pr_usrreqs = &rip6_usrreqs }; -static int ip6_mrouter_ver = 0; +#ifdef VIMAGE_GLOBALS +static int ip6_mrouter_ver; +#endif SYSCTL_DECL(_net_inet6); SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); static struct mrt6stat mrt6stat; SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW, &mrt6stat, mrt6stat, "Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)"); #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 static struct mf6c *mf6ctable[MF6CTBLSIZ]; SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD, &mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]", "Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], " "netinet6/ip6_mroute.h)"); static u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mif6table, CTLFLAG_RD, &mif6table, sizeof(mif6table), "S,vif[MAXMIFS]", "Multicast Interfaces (struct mif[MAXMIFS], netinet6/ip6_mroute.h)"); #ifdef MRT6DEBUG +#ifdef VIMAGE_GLOBALS static u_int mrt6debug = 0; /* debug level */ +#endif #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #endif static void expire_upcalls(void *); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ #ifdef INET #ifdef MROUTING extern struct socket *ip_mrouter; #endif #endif /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). Different from IPv4 register_if, * these interfaces are linked into the system ifnet list, * because per-interface IPv6 statistics are maintained in * ifp->if_afdata. But it does not have any routes point * to them. I.e., packets can't be sent this way. They * only exist as a placeholder for multicast source * verification. */ static struct ifnet *multicast_register_if6; #define ENCAP_HOPS 64 /* * Private variables. */ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RD, &pim6stat, pim6stat, "PIM Statistics (struct pim6stat, netinet6/pim_var.h)"); +#ifdef VIMAGE_GLOBALS static int pim6; +#endif /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. */ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ mrt6stat.mrt6s_mfc_lookups++; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ mrt6stat.mrt6s_mfc_misses++; \ } \ } while (/*CONSTCOND*/ 0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code * XXX: replace with timersub() ? 
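 * (Worked example, added in editing: for a = {5 s, 200 us} and
 * b = {3 s, 900 us}, delta starts at 200 - 900 = -700; xxs == 2 enters
 * "case 2" and falls through "case 1", adding 1000000 twice, giving
 * 1999300 us, i.e. the expected 1.9993 seconds.)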
*/ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (/*CONSTCOND*/ 0) /* XXX: replace with timercmp(a, b, <) ? */ #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 static u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int get_sg_cnt(struct sioc_sg_req6 *); static int get_mif6_cnt(struct sioc_mif_req6 *); static int ip6_mrouter_init(struct socket *, int, int); static int add_m6if(struct mif6ctl *); static int del_m6if(mifi_t *); static int add_m6fc(struct mf6cctl *); static int del_m6fc(struct mf6cctl *); static struct callout expire_upcalls_ch; int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m); int X_ip6_mrouter_done(void); int X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt); int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt); int X_mrt6_ioctl(int cmd, caddr_t data); + +static void +pim6_init(void) +{ + INIT_VNET_INET6(curvnet); + + V_ip6_mrouter_ver = 0; +#ifdef MRT6DEBUG + V_mrt6debug = 0; /* debug level */ +#endif +} /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ int X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt) { int error = 0; int optval; struct mif6ctl mifc; struct mf6cctl mfcc; mifi_t mifi; if (so != ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EACCES); switch (sopt->sopt_name) { case MRT6_INIT: #ifdef MRT6_OINIT case MRT6_OINIT: #endif error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = ip6_mrouter_init(so, optval, sopt->sopt_name); break; case MRT6_DONE: error = X_ip6_mrouter_done(); break; case MRT6_ADD_MIF: error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc)); if (error) break; error = add_m6if(&mifc); break; case MRT6_ADD_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = add_m6fc(&mfcc); break; case MRT6_DEL_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = del_m6fc(&mfcc); break; case MRT6_DEL_MIF: error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi)); if (error) break; error = del_m6if(&mifi); break; case MRT6_PIM: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = set_pim6(&optval); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Handle MRT getsockopt commands */ int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt) { INIT_VNET_INET6(curvnet); int error = 0; if (so != ip6_mrouter) return (EACCES); switch (sopt->sopt_name) { case MRT6_PIM: error = sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)); break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int X_mrt6_ioctl(int cmd, caddr_t data) { switch (cmd) { case SIOCGETSGCNT_IN6: return (get_sg_cnt((struct sioc_sg_req6 *)data)); case SIOCGETMIFCNT_IN6: return (get_mif6_cnt((struct sioc_mif_req6 *)data)); default: return (EINVAL); } } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req6 *req) { struct mf6c *rt; int s; s = splnet(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); splx(s); if (rt != NULL) { req->pktcnt = 
rt->mf6c_pkt_cnt;
		req->bytecnt = rt->mf6c_byte_cnt;
		req->wrong_if = rt->mf6c_wrong_if;
	} else
		return (ESRCH);
#if 0
	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
#endif

	return (0);
}

/*
 * returns the input and output packet and byte counts on the mif provided
 */
static int
get_mif6_cnt(struct sioc_mif_req6 *req)
{
	mifi_t mifi = req->mifi;

	if (mifi >= nummifs)
		return (EINVAL);

	req->icount = mif6table[mifi].m6_pkt_in;
	req->ocount = mif6table[mifi].m6_pkt_out;
	req->ibytes = mif6table[mifi].m6_bytes_in;
	req->obytes = mif6table[mifi].m6_bytes_out;

	return (0);
}

static int
set_pim6(int *i)
{
	INIT_VNET_INET6(curvnet);

	if ((*i != 1) && (*i != 0))
		return (EINVAL);

	V_pim6 = *i;

	return (0);
}

/*
 * Enable multicast routing
 */
static int
ip6_mrouter_init(struct socket *so, int v, int cmd)
{
	INIT_VNET_INET6(curvnet);

#ifdef MRT6DEBUG
	if (V_mrt6debug)
		log(LOG_DEBUG,
		    "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n",
		    so->so_type, so->so_proto->pr_protocol);
#endif

	if (so->so_type != SOCK_RAW ||
	    so->so_proto->pr_protocol != IPPROTO_ICMPV6)
		return (EOPNOTSUPP);

	if (v != 1)
		return (ENOPROTOOPT);

	if (ip6_mrouter != NULL)
		return (EADDRINUSE);

	ip6_mrouter = so;
	V_ip6_mrouter_ver = cmd;

	bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
	bzero((caddr_t)n6expire, sizeof(n6expire));

	V_pim6 = 0;	/* used for stubbing out/in pim stuff */

	callout_init(&expire_upcalls_ch, 0);
	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

#ifdef MRT6DEBUG
	if (V_mrt6debug)
		log(LOG_DEBUG, "ip6_mrouter_init\n");
#endif

	return (0);
}

/*
 * Disable multicast routing
 */
int
X_ip6_mrouter_done(void)
{
	INIT_VNET_INET6(curvnet);
	mifi_t mifi;
	int i;
	struct mf6c *rt;
	struct rtdetq *rte;
	int s;

	s = splnet();

	/*
	 * For each phyint in use, disable promiscuous reception of all IPv6
	 * multicasts.
	 */
#ifdef INET
#ifdef MROUTING
	/*
	 * If an IPv4 multicast routing daemon is still running, leave the
	 * interfaces as they are, so that they keep receiving all multicast
	 * packets.
	 * XXX: there may be an interface in which the IPv4 multicast
	 * daemon is not interested...
	 */
	if (!V_ip_mrouter)
#endif
#endif
	{
		for (mifi = 0; mifi < nummifs; mifi++) {
			if (mif6table[mifi].m6_ifp &&
			    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
				if_allmulti(mif6table[mifi].m6_ifp, 0);
			}
		}
	}
	bzero((caddr_t)mif6table, sizeof(mif6table));
	nummifs = 0;

	V_pim6 = 0;	/* used to stub out/in pim specific code */

	callout_stop(&expire_upcalls_ch);

	/*
	 * Free all multicast forwarding cache entries.
	 */
	for (i = 0; i < MF6CTBLSIZ; i++) {
		rt = mf6ctable[i];
		while (rt) {
			struct mf6c *frt;

			for (rte = rt->mf6c_stall; rte != NULL; ) {
				struct rtdetq *n = rte->next;

				m_free(rte->m);
				free(rte, M_MRTABLE6);
				rte = n;
			}
			frt = rt;
			rt = rt->mf6c_next;
			free(frt, M_MRTABLE6);
		}
	}

	bzero((caddr_t)mf6ctable, sizeof(mf6ctable));

	/*
	 * Reset register interface
	 */
	if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) {
		if_detach(multicast_register_if6);
		if_free(multicast_register_if6);
		reg_mif_num = (mifi_t)-1;
		multicast_register_if6 = NULL;
	}

	ip6_mrouter = NULL;
	V_ip6_mrouter_ver = 0;

	splx(s);

#ifdef MRT6DEBUG
	if (V_mrt6debug)
		log(LOG_DEBUG, "ip6_mrouter_done\n");
#endif

	return (0);
}

static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };

/*
 * Add a mif to the mif table
 */
static int
add_m6if(struct mif6ctl *mifcp)
{
	INIT_VNET_NET(curvnet);
	struct mif6 *mifp;
	struct ifnet *ifp;
	int error, s;

	if (mifcp->mif6c_mifi >= MAXMIFS)
		return (EINVAL);
	mifp = mif6table + mifcp->mif6c_mifi;
	if (mifp->m6_ifp)
		return (EADDRINUSE); /* XXX: is it appropriate?
*/ if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index) return (ENXIO); ifp = ifnet_byindex(mifcp->mif6c_pifi); if (mifcp->mif6c_flags & MIFF_REGISTER) { if (reg_mif_num == (mifi_t)-1) { ifp = if_alloc(IFT_OTHER); if_initname(ifp, "register_mif", 0); ifp->if_flags |= IFF_LOOPBACK; if_attach(ifp); multicast_register_if6 = ifp; reg_mif_num = mifcp->mif6c_mifi; /* * it is impossible to guess the ifindex of the * register interface. So mif6c_pifi is automatically * calculated. */ mifcp->mif6c_pifi = ifp->if_index; } else { ifp = multicast_register_if6; } } /* if REGISTER */ else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); s = splnet(); error = if_allmulti(ifp, 1); splx(s); if (error) return (error); } s = splnet(); mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; splx(s); /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; #ifdef MRT6DEBUG if (V_mrt6debug) log(LOG_DEBUG, "add_mif #%d, phyint %s\n", mifcp->mif6c_mifi, ifp->if_xname); #endif return (0); } /* * Delete a mif from the mif table */ static int del_m6if(mifi_t *mifip) { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; int s; if (*mifip >= nummifs) return (EINVAL); if (mifp->m6_ifp == NULL) return (EINVAL); s = splnet(); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* * XXX: what if there is yet IPv4 multicast daemon * using the interface? */ ifp = mifp->m6_ifp; if_allmulti(ifp, 0); } else { if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } } bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; splx(s); #ifdef MRT6DEBUG if (V_mrt6debug) log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs); #endif return (0); } /* * Add an mfc entry */ static int add_m6fc(struct mf6cctl *mfccp) { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; int s; char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_MFC) { log(LOG_DEBUG, "add_m6fc no upcall h %d o %s g %s p %x\n", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); } #endif s = splnet(); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; splx(s); return (0); } /* * Find the entry for which the upcall was made and update */ s = splnet(); hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); #ifdef MRT6DEBUG if 
(V_mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc o %s g %s p %x dbg %x\n", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); #endif rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip6_mdq(rte->m, rte->ifp, rt); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE6); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_mfc no upcall h %d o %s g %s p %x\n", hash, ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); #endif for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { splx(s); return (ENOBUFS); } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } splx(s); return (0); } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(struct timeval *t) { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(struct mf6cctl *mfccp) { struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; int s; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_MFC) { char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n", ip6_sprintf(ip6bufo, &origin.sin6_addr), ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr)); } #endif s = splnet(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { 
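		/*
		 * Added note: no fully-resolved entry matched; entries with
		 * a pending upcall (mf6c_stall != NULL) are deliberately
		 * skipped by the loop above.
		 */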
splx(s); return (EADDRNOTAVAIL); } *nptr = rt->mf6c_next; free(rt, M_MRTABLE6); splx(s); return (0); } static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src) { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return (0); } } m_freem(mm); return (-1); } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. * * NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff * this function is called in the originating context (i.e., not when * forwarding a packet from other node). ip6_output(), which is currently the * only function that calls this function is called in the originating context, * explicitly ensures this condition. It is caller's responsibility to ensure * that if this function is called from somewhere else in the originating * context in the future. */ int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m) { INIT_VNET_INET6(curvnet); struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; int s; mifi_t mifi; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp->if_index); #endif /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return (0); ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. * This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { V_ip6stat.ip6s_cantforward++; if (V_ip6_log_time + V_ip6_log_interval < time_second) { V_ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } return (0); } /* * Determine forwarding mifs from the forwarding cache table */ s = splnet(); MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); /* Entry exists, so forward if necessary */ if (rt) { splx(s); return (ip6_mdq(m, ifp, rt)); } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & * send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; /* int i, npkts;*/ #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif /* UPCALL_TIMING */ mrt6stat.mrt6s_no_route++; #ifdef MRT6DEBUG if (V_mrt6debug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); #endif /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. 
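		 * (Flow sketch, added in editing: the first packet for an
		 * unknown source/group pair is queued on mf6c_stall and a
		 * MRT6MSG_NOCACHE upcall is sent to the daemon's socket; the
		 * daemon answers with MRT6_ADD_MFC, whereupon add_m6fc()
		 * runs each stalled packet through ip6_mdq().)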
*/ rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6, M_NOWAIT); if (rte == NULL) { splx(s); return (ENOBUFS); } mb0 = m_copy(m, 0, M_COPYALL); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && (M_HASCL(mb0) || mb0->m_len < sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE6); splx(s); return (ENOBUFS); } /* is there an upcall waiting for this packet? */ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); splx(s); return (ENOBUFS); } /* * Make a copy of the header to send to the user * level process */ mm = m_copy(mb0, 0, sizeof(struct ip6_hdr)); if (mm == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); splx(s); return (ENOBUFS); } /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im = NULL; #ifdef MRT6_OINIT oim = NULL; #endif switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); splx(s); return (EINVAL); } #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "getting the iif info in the kernel\n"); #endif for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; break; #endif case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); mrt6stat.mrt6s_upq_sockfull++; free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); splx(s); return (ENOBUFS); } mrt6stat.mrt6s_upcalls++; /* insert new entry at head of hash chain */ bzero(rt, sizeof(*rt)); rt->mf6c_origin.sin6_family = AF_INET6; rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_origin.sin6_addr = ip6->ip6_src; rt->mf6c_mcastgrp.sin6_family = AF_INET6; rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) if (++npkts > MAX_UPQ6) { mrt6stat.mrt6s_upq_ovflw++; free(rte, M_MRTABLE6); m_freem(mb0); splx(s); return (0); } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif /* UPCALL_TIMING */ splx(s); return (0); } } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. 
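 * (Editorial note: with EXPIRE_TIMEOUT defined as hz / 4 above, the
 * callout actually fires four times a second, so UPCALL_EXPIRE == 6
 * ticks bounds an unanswered upcall's lifetime to roughly 1.5 seconds.)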
*/ static void expire_upcalls(void *unused) { struct rtdetq *rte; struct mf6c *mfc, **nptr; int i; int s; s = splnet(); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_EXPIRE) { char ip6bufo[INET6_ADDRSTRLEN]; char ip6bufg[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n", ip6_sprintf(ip6bufo, &mfc->mf6c_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfc->mf6c_mcastgrp.sin6_addr)); } #endif /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } while (rte != NULL); mrt6stat.mrt6s_cache_cleanups++; n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE6); } else { nptr = &mfc->mf6c_next; } } } splx(s); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; struct in6_addr src0, dst0; /* copies for local work */ u_int32_t iszone, idzone, oszone, odzone; int error = 0; /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (/*CONSTCOND*/ 0) /* * Don't forward if it didn't arrive from the parent mif * for its origin. */ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifid %d mifi %d mififid %x\n", ifp->if_index, mifi, mif6table[mifi].m6_ifp->if_index); #endif mrt6stat.mrt6s_wrong_if++; rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) if (V_pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... 
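			 * (Presumably, added in editing: M_LOOP marks a
			 * multicast copy looped back from our own output
			 * path, so such packets arriving on the "wrong"
			 * mif are expected and need no WRONGMIF upcall.)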
*/ static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mbuf *mm; struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif mm = m_copy(m, 0, sizeof(struct ip6_hdr)); if (mm && (M_HASCL(mm) || mm->m_len < sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return (ENOBUFS); #ifdef MRT6_OINIT oim = NULL; #endif im = NULL; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return (EINVAL); } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; #endif case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (V_mrt6debug) log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return (ENOBUFS); } /* if socket Q full */ } /* if PIM */ return (0); } /* if wrong iif */ /* If I sourced this packet, it counts as output, else it was input. */ if (m->m_pkthdr.rcvif == NULL) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ src0 = ip6->ip6_src; dst0 = ip6->ip6_dst; if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { V_ip6stat.ip6s_badscope++; return (error); } for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break * a scope boundary. * XXX For packets through PIM register tunnel * interface, we believe a routing daemon. */ if (!(mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if (in6_setscope(&src0, mif6table[mifi].m6_ifp, &oszone) || in6_setscope(&dst0, mif6table[mifi].m6_ifp, &odzone) || iszone != oszone || idzone != odzone) { V_ip6stat.ip6s_badscope++; continue; } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } } return (0); } static void phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m) { INIT_VNET_INET6(curvnet); struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; int s = splnet(); /* needs to protect static "ro" below. */ static struct route_in6 ro; struct in6_multi *in6m; struct sockaddr_in6 *dst6; u_long linkmtu; /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. */ mb_copy = m_copy(m, 0, M_COPYALL); if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { splx(s); return; } /* set MCAST flag to the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may devide * the packet into fragments when the packet is too big for the * outgoing interface. 
* Otherwise, we can simply send the packet to the interface * sending queue. */ if (m->m_pkthdr.rcvif == NULL) { struct ip6_moptions im6o; im6o.im6o_multicast_ifp = ifp; /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING, &im6o, NULL, NULL); #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif splx(s); return; } /* * If we belong to the destination multicast group * on the outgoing interface, loop back a copy. */ dst6 = (struct sockaddr_in6 *)&ro.ro_dst; IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); if (in6m != NULL) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; ip6_mloopback(ifp, m, (struct sockaddr_in6 *)&ro.ro_dst); } /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ linkmtu = IN6_LINKMTU(ifp); if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) { dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; /* * We just call if_output instead of nd6_output here, since * we need no ND for a multicast forwarded packet...right? */ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, NULL); #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif } else { /* * pMTU discovery is intentionally disabled by default, since * various router may notify pMTU in multicast, which can be * a DDoS to a router */ if (V_ip6_mcast_pmtu) icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu); else { #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_XMIT) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "phyint_send: packet too big on %s o %s " "g %s size %d(discarded)\n", if_name(ifp), ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), mb_copy->m_pkthdr.len); } #endif /* MRT6DEBUG */ m_freem(mb_copy); /* simply discard the packet */ } } splx(s); } static int register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m) { struct mbuf *mm; int i, len = m->m_pkthdr.len; static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mrt6msg *im6; #ifdef MRT6DEBUG if (V_mrt6debug) { char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); } #endif ++pim6stat.pim6s_snd_registers; /* Make a copy of the packet to send to the user level process */ MGETHDR(mm, M_DONTWAIT, MT_HEADER); if (mm == NULL) return (ENOBUFS); mm->m_pkthdr.rcvif = NULL; mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); if ((mm->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { m_freem(mm); return (ENOBUFS); } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return (ENOBUFS); /* TODO: check it! */ mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. 
encap. */ mrt6stat.mrt6s_upcalls++; if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG if (V_mrt6debug) log(LOG_WARNING, "register_send: ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; return (ENOBUFS); } return (0); } /* * PIM sparse mode hook * Receives the pim control messages, and passes them up to the listening * socket, using rip6_input. * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. */ int pim6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; struct mbuf *m = *mp; int minlen; int off = *offp; ++pim6stat.pim6s_rcv_total; ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG,"pim6_input: PIM packet too short\n"); #endif m_freem(m); return (IPPROTO_DONE); } /* * if the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers are in contiguous memory, and * possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { pim6stat.pim6s_rcv_tooshort++; return (IPPROTO_DONE); } #endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { ++pim6stat.pim6s_rcv_badsum; #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: invalid checksum\n"); #endif m_freem(m); return (IPPROTO_DONE); } } #endif /* PIM6_CHECKSUM */ /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { ++pim6stat.pim6s_rcv_badversion; #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: incorrect version %d, expecting %d\n", pim->pim_ver, PIM_VERSION); #endif m_freem(m); return (IPPROTO_DONE); } if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon.
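/*
 * Editor's illustration (not part of this change): the REGISTER layout
 * that the PIM_MINLEN and PIM6_REG_MINLEN arithmetic above encodes, as
 * a self-contained sketch. The struct and field names below are
 * placeholders chosen for the example, not the kernel's definitions.
 */
#include <assert.h>
#include <stdint.h>

struct pim_hdr_sketch {		/* 4 bytes: version/type, rsvd, cksum */
	uint8_t		pim_vt;
	uint8_t		pim_reserved;
	uint16_t	pim_cksum;
};

struct pim6_reg_sketch {
	struct pim_hdr_sketch	pim;		/*  4 bytes              */
	uint32_t		reg_flags;	/*  4 bytes: B/N bits    */
	uint8_t			inner_ip6[40];	/* 40 bytes: eip6 header */
};

int
main(void)
{
	/* PIM_MINLEN == pimhdr + u_int32 == 8 */
	assert(sizeof(struct pim_hdr_sketch) + sizeof(uint32_t) == 8);
	/* PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */
	assert(sizeof(struct pim6_reg_sketch) == 48);
	return (0);
}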
*/ static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; int rc; #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif ++pim6stat.pim6s_rcv_registers; if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: register mif not set: %d\n", reg_mif_num); #endif m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: register packet size too " "small %d from %s\n", pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src)); #endif m_freem(m); return (IPPROTO_DONE); } eip6 = (struct ip6_hdr *) (reghdr + 1); #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input[register], eip6: %s -> %s, " "eip6 plen %d\n", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), ntohs(eip6->ip6_plen)); #endif /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG log(LOG_DEBUG, "pim6_input: invalid IP version (%d) " "of the inner packet\n", (eip6->ip6_vfc & IPV6_VERSION)); #endif m_freem(m); return (IPPROTO_NONE); } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: inner packet of register " "is not multicast %s\n", ip6_sprintf(ip6bufd, &eip6->ip6_dst)); #endif m_freem(m); return (IPPROTO_DONE); } /* * make a copy of the whole header to pass to the daemon later. */ mcp = m_copy(m, 0, off + PIM6_REG_MINLEN); if (mcp == NULL) { #ifdef MRT6DEBUG log(LOG_ERR, "pim6_input: pim register: " "could not copy register head\n"); #endif m_freem(m); return (IPPROTO_DONE); } /* * forward the inner ip6 packet; point m_data at the inner ip6. */ m_adj(m, off + PIM_MINLEN); #ifdef MRT6DEBUG if (V_mrt6debug & DEBUG_PIM) { log(LOG_DEBUG, "pim6_input: forwarding decapsulated register: " "src %s, dst %s, mif %d\n", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), reg_mif_num); } #endif rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m, dst.sin6_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. */ pim6_input_to_daemon: rip6_input(&m, offp, proto); return (IPPROTO_DONE); } Index: head/sys/netinet6/mld6.c =================================================================== --- head/sys/netinet6/mld6.c (revision 185087) +++ head/sys/netinet6/mld6.c (revision 185088) @@ -1,654 +1,656 @@ /*- * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Protocol constants */ /* denotes that the MLD max response delay field specifies time in milliseconds */ #define MLD_TIMER_SCALE 1000 /* * time between repetitions of a node's initial report of interest in a * multicast address (in seconds) */ #define MLD_UNSOLICITED_REPORT_INTERVAL 10 +#ifdef VIMAGE_GLOBALS static struct ip6_pktopts ip6_opts; +#endif static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *); static void mld_starttimer(struct in6_multi *); static void mld_stoptimer(struct in6_multi *); static void mld_timeo(struct in6_multi *); static u_long mld_timerresid(struct in6_multi *); void mld6_init(void) { INIT_VNET_INET6(curvnet); static u_int8_t hbh_buf[8]; struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); /* ip6h_nxt will be filled in later */ hbh->ip6h_len = 0; /* (8 >> 3) - 1 */ /* XXX: grotty hard coding... */ hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */ hbh_buf[3] = 0; hbh_buf[4] = IP6OPT_ROUTER_ALERT; hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t)); ip6_initpktopts(&V_ip6_opts); V_ip6_opts.ip6po_hbh = hbh; } static void mld_starttimer(struct in6_multi *in6m) { struct timeval now; microtime(&now); in6m->in6m_timer_expire.tv_sec = now.tv_sec + in6m->in6m_timer / hz; in6m->in6m_timer_expire.tv_usec = now.tv_usec + (in6m->in6m_timer % hz) * (1000000 / hz); if (in6m->in6m_timer_expire.tv_usec > 1000000) { in6m->in6m_timer_expire.tv_sec++; in6m->in6m_timer_expire.tv_usec -= 1000000; } /* start or restart the timer */ callout_reset(in6m->in6m_timer_ch, in6m->in6m_timer, (void (*)(void *))mld_timeo, in6m); } static void mld_stoptimer(struct in6_multi *in6m) { if (in6m->in6m_timer == IN6M_TIMER_UNDEF) return; callout_stop(in6m->in6m_timer_ch); in6m->in6m_timer = IN6M_TIMER_UNDEF; } static void mld_timeo(struct in6_multi *in6m) { int s = splnet(); in6m->in6m_timer = IN6M_TIMER_UNDEF; callout_stop(in6m->in6m_timer_ch); switch (in6m->in6m_state) { case MLD_REPORTPENDING: mld6_start_listening(in6m); break; default: mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); break; } splx(s); } static u_long mld_timerresid(struct in6_multi *in6m) { struct timeval now, diff; microtime(&now); if (now.tv_sec > in6m->in6m_timer_expire.tv_sec || (now.tv_sec == in6m->in6m_timer_expire.tv_sec && now.tv_usec > in6m->in6m_timer_expire.tv_usec)) { return (0); } diff = in6m->in6m_timer_expire; diff.tv_sec -= now.tv_sec; diff.tv_usec -= now.tv_usec; if (diff.tv_usec < 0) { diff.tv_sec--; diff.tv_usec += 1000000; } /* return the remaining time in milliseconds */ return (diff.tv_sec * 1000 + diff.tv_usec / 1000); } void mld6_start_listening(struct in6_multi *in6m) { struct in6_addr all_in6; int s = splnet(); /* * RFC2710 page 10: * The node never sends a Report or Done for the link-scope all-nodes * address. * MLD messages are never sent for multicast addresses whose scope is 0 * (reserved) or 1 (node-local). */ all_in6 = in6addr_linklocal_allnodes; if (in6_setscope(&all_in6, in6m->in6m_ifp, NULL)) { /* XXX: this should not happen!
*/ in6m->in6m_timer = 0; in6m->in6m_state = MLD_OTHERLISTENER; } if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) { in6m->in6m_timer = 0; in6m->in6m_state = MLD_OTHERLISTENER; } else { mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = arc4random() % MLD_UNSOLICITED_REPORT_INTERVAL * hz; in6m->in6m_state = MLD_IREPORTEDLAST; mld_starttimer(in6m); } splx(s); } void mld6_stop_listening(struct in6_multi *in6m) { struct in6_addr allnode, allrouter; allnode = in6addr_linklocal_allnodes; if (in6_setscope(&allnode, in6m->in6m_ifp, NULL)) { /* XXX: this should not happen! */ return; } allrouter = in6addr_linklocal_allrouters; if (in6_setscope(&allrouter, in6m->in6m_ifp, NULL)) { /* XXX impossible */ return; } if (in6m->in6m_state == MLD_IREPORTEDLAST && !IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &allnode) && IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) > IPV6_ADDR_SCOPE_INTFACELOCAL) { mld6_sendpkt(in6m, MLD_LISTENER_DONE, &allrouter); } } void mld6_input(struct mbuf *m, int off) { INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct mld_hdr *mldh; struct ifnet *ifp = m->m_pkthdr.rcvif; struct in6_multi *in6m; struct in6_addr mld_addr, all_in6; struct in6_ifaddr *ia; struct ifmultiaddr *ifma; u_long timer; /* timer value in the MLD query header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*mldh),); mldh = (struct mld_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh)); if (mldh == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif /* source address validation */ ip6 = mtod(m, struct ip6_hdr *); /* in case m_pullup */ if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) { char ip6bufs[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; log(LOG_ERR, "mld6_input: src %s is not link-local (grp=%s)\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufg, &mldh->mld_addr)); /* * spec (RFC2710) does not explicitly * specify to discard the packet from a non link-local * source address. But we believe it's expected to do so. * XXX: do we have to allow :: as source? */ m_freem(m); return; } /* * make a copy for local work (in6_setscope() may modify the 1st arg) */ mld_addr = mldh->mld_addr; if (in6_setscope(&mld_addr, ifp, NULL)) { /* XXX: this should not happen! */ m_free(m); return; } /* * In the MLD6 specification, there are 3 states and a flag. * * In Non-Listener state, we simply don't have a membership record. * In Delaying Listener state, our timer is running (in6m->in6m_timer) * In Idle Listener state, our timer is not running * (in6m->in6m_timer==IN6M_TIMER_UNDEF) * * The flag is in6m->in6m_state, it is set to MLD_OTHERLISTENER if * we have heard a report from another member, or MLD_IREPORTEDLAST * if we sent the last report. */ switch (mldh->mld_type) { case MLD_LISTENER_QUERY: if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) && !IN6_IS_ADDR_MULTICAST(&mld_addr)) break; /* print error or log stat? */ all_in6 = in6addr_linklocal_allnodes; if (in6_setscope(&all_in6, ifp, NULL)) { /* XXX: this should not happen! */ break; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to the "all-nodes" group (ff02::1). * - Restart any timer that is already running but has * a value longer than the requested timeout. * - Use the value specified in the query message as * the maximum timeout (see the sketch just below).
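/*
 * Editor's sketch (not part of this change): the Max Response Delay
 * conversion that the rules above describe and the code below
 * implements, in isolation. The query carries the delay in
 * milliseconds; the protocol timer runs at PR_FASTHZ ticks per second,
 * so the value is scaled down and then clamped up to one tick so that
 * a small but non-zero delay is not rounded to "respond immediately".
 * The names and PR_FASTHZ == 5 are assumptions for the example.
 */
#include <stdint.h>

#define SKETCH_PR_FASTHZ 5	/* protocol fast-timeout ticks per second */
#define SKETCH_TIMER_SCALE 1000	/* MLD max delay field is in milliseconds */

static unsigned long
mld_delay_to_ticks_sketch(uint16_t maxdelay_msec)
{
	unsigned long timer;

	timer = (unsigned long)maxdelay_msec * SKETCH_PR_FASTHZ /
	    SKETCH_TIMER_SCALE;
	if (timer == 0 && maxdelay_msec != 0)
		timer = 1;	/* timer resolution too coarse; round up */
	return (timer);
}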
*/ timer = ntohs(mldh->mld_maxdelay); IFP_TO_IA6(ifp, ia); if (ia == NULL) break; /* * XXX: System timer resolution is too low to handle Max * Response Delay, so set 1 to the internal timer even if * the calculated value equals to zero when Max Response * Delay is positive. */ timer = ntohs(mldh->mld_maxdelay) * PR_FASTHZ / MLD_TIMER_SCALE; if (timer == 0 && mldh->mld_maxdelay) timer = 1; IF_ADDR_LOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; in6m = (struct in6_multi *)ifma->ifma_protospec; if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) || IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (IN6_IS_ADDR_UNSPECIFIED(&mld_addr) || IN6_ARE_ADDR_EQUAL(&mld_addr, &in6m->in6m_addr)) { if (timer == 0) { /* send a report immediately */ mld_stoptimer(in6m); mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); in6m->in6m_timer = 0; /* reset timer */ in6m->in6m_state = MLD_IREPORTEDLAST; } else if (in6m->in6m_timer == IN6M_TIMER_UNDEF || mld_timerresid(in6m) > timer) { in6m->in6m_timer = 1 + (arc4random() % timer) * hz / 1000; mld_starttimer(in6m); } } } IF_ADDR_UNLOCK(ifp); break; case MLD_LISTENER_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * can potentially get looped back if we are a multicast * router, so discard reports sourced by me. * Note that it is impossible to check IFF_LOOPBACK flag of * ifp for this purpose, since ip6_mloopback pass the physical * interface to looutput. */ if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */ break; if (!IN6_IS_ADDR_MULTICAST(&mld_addr)) break; /* * If we belong to the group being reported, stop * our timer for that group. */ IN6_LOOKUP_MULTI(mld_addr, ifp, in6m); if (in6m) { in6m->in6m_timer = 0; /* transit to idle state */ in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */ } break; default: /* this is impossible */ log(LOG_ERR, "mld6_input: illegal type(%d)", mldh->mld_type); break; } m_freem(m); } static void mld6_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst) { INIT_VNET_INET6(curvnet); struct mbuf *mh, *md; struct mld_hdr *mldh; struct ip6_hdr *ip6; struct ip6_moptions im6o; struct in6_ifaddr *ia; struct ifnet *ifp = in6m->in6m_ifp; struct ifnet *outif = NULL; /* * At first, find a link local address on the outgoing interface * to use as the source address of the MLD packet. */ if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST)) == NULL) return; /* * Allocate mbufs to store ip6 header and MLD header. * We allocate 2 mbufs and make chain in advance because * it is more convenient when inserting the hop-by-hop option later. */ MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == NULL) return; MGET(md, M_DONTWAIT, MT_DATA); if (md == NULL) { m_free(mh); return; } mh->m_next = md; mh->m_pkthdr.rcvif = NULL; mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); MH_ALIGN(mh, sizeof(struct ip6_hdr)); /* fill in the ip6 header */ ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; /* ip6_hlim will be set by im6o.im6o_multicast_hlim */ ip6->ip6_src = ia->ia_addr.sin6_addr; ip6->ip6_dst = dst ? 
*dst : in6m->in6m_addr; /* fill in the MLD header */ md->m_len = sizeof(struct mld_hdr); mldh = mtod(md, struct mld_hdr *); mldh->mld_type = type; mldh->mld_code = 0; mldh->mld_cksum = 0; /* XXX: we assume the function will not be called for query messages */ mldh->mld_maxdelay = 0; mldh->mld_reserved = 0; mldh->mld_addr = in6m->in6m_addr; in6_clearscope(&mldh->mld_addr); /* XXX */ mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); /* construct multicast option */ bzero(&im6o, sizeof(im6o)); im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 1; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ im6o.im6o_multicast_loop = (ip6_mrouter != NULL); /* increment output statistics */ V_icmp6stat.icp6s_outhist[type]++; ip6_output(mh, &V_ip6_opts, NULL, 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); switch (type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(outif, ifs6_out_mldquery); break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(outif, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(outif, ifs6_out_mlddone); break; } } } /* * Add an address to the list of IP6 multicast addresses for a given interface. * Add source addresses to the list also, if upstream router is MLDv2 capable * and the number of sources is not 0. */ struct in6_multi * in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp, int *errorp, int delay) { struct in6_multi *in6m; *errorp = 0; in6m = NULL; IFF_LOCKGIANT(ifp); /*IN6_MULTI_LOCK();*/ IN6_LOOKUP_MULTI(*maddr6, ifp, in6m); if (in6m != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(in6m->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, in6m->in6m_refcount)); ++in6m->in6m_refcount; } else do { struct in6_multi *nin6m; struct ifmultiaddr *ifma; struct sockaddr_in6 sa6; bzero(&sa6, sizeof(sa6)); sa6.sin6_family = AF_INET6; sa6.sin6_len = sizeof(struct sockaddr_in6); sa6.sin6_addr = *maddr6; *errorp = if_addmulti(ifp, (struct sockaddr *)&sa6, &ifma); if (*errorp) break; /* * If ifma->ifma_protospec is null, then if_addmulti() created * a new record. Otherwise, bump refcount, and we are done. */ if (ifma->ifma_protospec != NULL) { in6m = ifma->ifma_protospec; ++in6m->in6m_refcount; break; } nin6m = malloc(sizeof(*nin6m), M_IP6MADDR, M_NOWAIT | M_ZERO); if (nin6m == NULL) { if_delmulti_ifma(ifma); break; } nin6m->in6m_addr = *maddr6; nin6m->in6m_ifp = ifp; nin6m->in6m_refcount = 1; nin6m->in6m_ifma = ifma; ifma->ifma_protospec = nin6m; nin6m->in6m_timer_ch = malloc(sizeof(*nin6m->in6m_timer_ch), M_IP6MADDR, M_NOWAIT); if (nin6m->in6m_timer_ch == NULL) { free(nin6m, M_IP6MADDR); if_delmulti_ifma(ifma); break; } LIST_INSERT_HEAD(&in6_multihead, nin6m, in6m_entry); callout_init(nin6m->in6m_timer_ch, 0); nin6m->in6m_timer = delay; if (nin6m->in6m_timer > 0) { nin6m->in6m_state = MLD_REPORTPENDING; mld_starttimer(nin6m); } mld6_start_listening(nin6m); in6m = nin6m; } while (0); /*IN6_MULTI_UNLOCK();*/ IFF_UNLOCKGIANT(ifp); return (in6m); } /* * Delete a multicast address record. * * TODO: Locking, as per netinet.
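/*
 * Editor's sketch of the reference-counting contract implemented by
 * in6_addmulti() above and in6_delmulti() below: the first join
 * allocates the record and starts MLD listening, later joins only bump
 * the count, and the record is torn down when the last leave drops the
 * count to zero. All names here are placeholders for illustration.
 */
#include <stdlib.h>

struct mgroup_sketch {
	int refcount;
};

static struct mgroup_sketch *
mgroup_join_sketch(struct mgroup_sketch *g)
{
	if (g != NULL) {
		g->refcount++;		/* already a member: reuse record */
		return (g);
	}
	g = calloc(1, sizeof(*g));	/* first join: allocate the record */
	if (g != NULL)
		g->refcount = 1;	/* ... and start reporting here */
	return (g);
}

static void
mgroup_leave_sketch(struct mgroup_sketch *g)
{
	if (--g->refcount == 0)		/* last leave: send Done and free */
		free(g);
}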
*/ void in6_delmulti(struct in6_multi *in6m) { struct ifmultiaddr *ifma; KASSERT(in6m->in6m_refcount >= 1, ("%s: freeing freed in6m", __func__)); if (--in6m->in6m_refcount == 0) { mld_stoptimer(in6m); mld6_stop_listening(in6m); ifma = in6m->in6m_ifma; KASSERT(ifma->ifma_protospec == in6m, ("%s: ifma_protospec != in6m", __func__)); ifma->ifma_protospec = NULL; LIST_REMOVE(in6m, in6m_entry); free(in6m->in6m_timer_ch, M_IP6MADDR); free(in6m, M_IP6MADDR); if_delmulti_ifma(ifma); } } Index: head/sys/netinet6/nd6.c =================================================================== --- head/sys/netinet6/nd6.c (revision 185087) +++ head/sys/netinet6/nd6.c (revision 185088) @@ -1,2425 +1,2460 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((struct sockaddr_in6 *)s) #define SDL(s) ((struct sockaddr_dl *)s) -/* timer values */ -int nd6_prune = 1; /* walk list every 1 seconds */ -int nd6_delay = 5; /* delay first probe time 5 second */ -int nd6_umaxtries = 3; /* maximum unicast query */ -int nd6_mmaxtries = 3; /* maximum multicast query */ -int nd6_useloopback = 1; /* use loopback interface for local traffic */ -int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ +#ifdef VIMAGE_GLOBALS +int nd6_prune; +int nd6_delay; +int nd6_umaxtries; +int nd6_mmaxtries; +int nd6_useloopback; +int nd6_gctimer; /* preventing too many loops in ND option parsing */ -int nd6_maxndopt = 10; /* max # of ND options allowed */ +int nd6_maxndopt; -int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ -int nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ +int nd6_maxnudhint; +int nd6_maxqueuelen; -#ifdef ND6_DEBUG -int nd6_debug = 1; -#else -int nd6_debug = 0; -#endif +int nd6_debug; /* for debugging? */ static int nd6_inuse, nd6_allocated; +struct llinfo_nd6 llinfo_nd6; -struct llinfo_nd6 llinfo_nd6 = {&llinfo_nd6, &llinfo_nd6}; struct nd_drhead nd_defrouter; -struct nd_prhead nd_prefix = { 0 }; +struct nd_prhead nd_prefix; -int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; +int nd6_recalc_reachtm_interval; +#endif /* VIMAGE_GLOBALS */ + static struct sockaddr_in6 all1_sa; static int nd6_is_new_addr_neighbor __P((struct sockaddr_in6 *, struct ifnet *)); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static struct llinfo_nd6 *nd6_free(struct rtentry *, int); static void nd6_llinfo_timer(void *); static void clear_llinfo_pqueue(struct llinfo_nd6 *); +#ifdef VIMAGE_GLOBALS struct callout nd6_slowtimo_ch; struct callout nd6_timer_ch; extern struct callout in6_tmpaddrtimer_ch; +extern int dad_ignore_ns; +extern int dad_maxtry; +#endif void nd6_init(void) { INIT_VNET_INET6(curvnet); static int nd6_init_done = 0; int i; if (nd6_init_done) { log(LOG_NOTICE, "nd6_init called more than once(ignored)\n"); return; } + + V_nd6_prune = 1; /* walk list every 1 seconds */ + V_nd6_delay = 5; /* delay first probe time 5 second */ + V_nd6_umaxtries = 3; /* maximum unicast query */ + V_nd6_mmaxtries = 3; /* maximum multicast query */ + V_nd6_useloopback = 1; /* use loopback interface for local traffic */ + V_nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ + + /* preventing too many loops in ND option parsing */ + V_nd6_maxndopt = 10; /* max # of ND options allowed */ + + V_nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ + V_nd6_maxqueuelen = 1; /* max pkts cached in unresolved ND entries */ + +#ifdef ND6_DEBUG + V_nd6_debug = 1; +#else + V_nd6_debug = 0; +#endif + + V_nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; + + V_dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ + 
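/*
 * Editor's illustration of the VIMAGE_GLOBALS conversion this hunk
 * applies: a global that used to carry a static initializer becomes an
 * uninitialized global compiled only for the flat (non-virtualized)
 * case, and its initial value moves into the subsystem init routine
 * behind the V_ accessor, so each virtual network stack can later hold
 * its own instance. The variable below is a stand-in, not the
 * kernel's; VIMAGE_GLOBALS is defined locally so the sketch is
 * self-contained.
 */
#define VIMAGE_GLOBALS 1

#ifdef VIMAGE_GLOBALS
int sketch_prune;			/* was: int sketch_prune = 1; */
#endif
#define V_sketch_prune sketch_prune	/* flat case: V_foo maps to foo */

static void
sketch_subsys_init(void)
{
	V_sketch_prune = 1;		/* former static initializer */
}

int
main(void)
{
	sketch_subsys_init();
	return (V_sketch_prune == 1 ? 0 : 1);
}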
V_dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ + + V_llinfo_nd6.ln_next = &V_llinfo_nd6; + V_llinfo_nd6.ln_prev = &V_llinfo_nd6; + LIST_INIT(&V_nd_prefix); + + ip6_use_tempaddr = 0; + ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; + ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; + ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; all1_sa.sin6_family = AF_INET6; all1_sa.sin6_len = sizeof(struct sockaddr_in6); for (i = 0; i < sizeof(all1_sa.sin6_addr); i++) all1_sa.sin6_addr.s6_addr[i] = 0xff; /* initialization of the default router list */ TAILQ_INIT(&V_nd_defrouter); /* start timer */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, NULL); nd6_init_done = 1; } struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = (struct nd_ifinfo *)malloc(sizeof(*nd), M_IP6NDP, M_WAITOK); bzero(nd, sizeof(*nd)); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; /* * Note that the default value of ip6_accept_rtadv is 0, which means * we won't accept RAs by default even if we set ND6_IFF_ACCEPT_RTADV * here. */ nd->flags = (ND6_IFF_PERFORMNUD | ND6_IFF_ACCEPT_RTADV); /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct nd_ifinfo *nd) { free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { INIT_VNET_INET6(ifp->if_vnet); u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; switch (ifp->if_type) { case IFT_ARCNET: ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */ break; case IFT_FDDI: ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */ break; case IFT_ISO88025: ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu); break; default: ndi->maxmtu = ifp->if_mtu; break; } /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ #undef MIN } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. 
*/ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; if (ndopts == NULL) panic("ndopts == NULL in nd6_option"); if (ndopts->nd_opts_last == NULL) panic("uninitialized ndopts in nd6_option"); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { INIT_VNET_INET6(curvnet); struct nd_opt_hdr *nd_opt; int i = 0; if (ndopts == NULL) panic("ndopts == NULL in nd6_options"); if (ndopts->nd_opts_last == NULL) panic("uninitialized ndopts in nd6_options"); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ V_icmp6stat.icp6s_nd_badopt++; bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol.
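/*
 * Editor's sketch (placeholder names, not the kernel's): the TLV walk
 * nd6_option() above implements. Each ND option starts with a type
 * octet and a length octet counting 8-byte units, so the byte length
 * is len << 3; a zero length octet is invalid and must abort parsing,
 * as must an option that overruns the buffer.
 */
#include <stddef.h>
#include <stdint.h>

struct nd_opt_sketch {
	uint8_t type;
	uint8_t len8;	/* option length in units of 8 octets */
};

static const struct nd_opt_sketch *
nd_opt_next_sketch(const uint8_t *buf, size_t buflen, size_t *off)
{
	const struct nd_opt_sketch *opt;
	size_t olen;

	if (*off + sizeof(*opt) > buflen)
		return (NULL);			/* truncated option header */
	opt = (const struct nd_opt_sketch *)(buf + *off);
	olen = (size_t)opt->len8 << 3;		/* units of 8 bytes */
	if (olen == 0 || *off + olen > buflen)
		return (NULL);			/* invalid or overrunning */
	*off += olen;				/* advance to next option */
	return (opt);
}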
*/ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { V_icmp6stat.icp6s_nd_toomanyopt++; nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ void nd6_llinfo_settimer(struct llinfo_nd6 *ln, long tick) { if (tick < 0) { ln->ln_expire = 0; ln->ln_ntick = 0; callout_stop(&ln->ln_timer_ch); } else { ln->ln_expire = time_second + tick / hz; if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; callout_reset(&ln->ln_timer_ch, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; callout_reset(&ln->ln_timer_ch, tick, nd6_llinfo_timer, ln); } } } static void nd6_llinfo_timer(void *arg) { struct llinfo_nd6 *ln; struct rtentry *rt; struct in6_addr *dst; struct ifnet *ifp; struct nd_ifinfo *ndi = NULL; ln = (struct llinfo_nd6 *)arg; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer(ln, ln->ln_ntick); } return; } if ((rt = ln->ln_rt) == NULL) panic("ln->ln_rt == NULL"); if ((ifp = rt->rt_ifp) == NULL) panic("ln->ln_rt->rt_ifp == NULL"); ndi = ND_IFINFO(ifp); CURVNET_SET(ifp->if_vnet); INIT_VNET_INET6(curvnet); /* sanity check */ if (rt->rt_llinfo && (struct llinfo_nd6 *)rt->rt_llinfo != ln) panic("rt_llinfo(%p) is not equal to ln(%p)", rt->rt_llinfo, ln); if (rt_key(rt) == NULL) panic("rt key is NULL in nd6_timer(ln=%p)", ln); dst = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->ln_asked < V_nd6_mmaxtries) { ln->ln_asked++; nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000); nd6_ns_output(ifp, NULL, dst, ln, 0); } else { struct mbuf *m = ln->ln_hold; if (m) { struct mbuf *m0; /* * assuming every packet in ln_hold has the * same IP header */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, rt->rt_ifp); ln->ln_hold = m0; clear_llinfo_pqueue(ln); } if (rt && rt->rt_llinfo) (void)nd6_free(rt, 0); ln = NULL; } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } break; case ND6_LLINFO_STALE: /* Garbage Collection(RFC 2461 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) { if (rt && rt->rt_llinfo) (void)nd6_free(rt, 1); ln = NULL; } break; case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->ln_asked = 1; ln->ln_state = ND6_LLINFO_PROBE; nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000); nd6_ns_output(ifp, dst, dst, ln, 0); } else { ln->ln_state = ND6_LLINFO_STALE; /* XXX */ nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } break; case ND6_LLINFO_PROBE: if (ln->ln_asked < V_nd6_umaxtries) { ln->ln_asked++; nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000); nd6_ns_output(ifp, dst, dst, ln, 0); } else if (rt->rt_ifa != NULL && rt->rt_ifa->ifa_addr->sa_family == AF_INET6 && (((struct in6_ifaddr *)rt->rt_ifa)->ia_flags & IFA_ROUTE)) { /* * This is an unreachable neighbor whose address is * specified as the destination of a p2p interface * (see in6_ifinit()). We should not free the entry * since this is sort of a "static" entry generated * via interface address configuration. 
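/*
 * Editor's aside (illustrative, names are placeholders): the RFC 2461
 * Neighbor Unreachability Detection states that nd6_llinfo_timer()
 * above steps through, with the action each timeout case takes.
 */
enum nud_state_sketch {
	SKETCH_INCOMPLETE,	/* resolving: retransmit multicast NS;
				 * after nd6_mmaxtries, ICMP the held
				 * packets and free the entry */
	SKETCH_REACHABLE,	/* confirmed: on expiry, demote to STALE */
	SKETCH_STALE,		/* unconfirmed: garbage-collect on expiry */
	SKETCH_DELAY,		/* recently used: wait, then start PROBE */
	SKETCH_PROBE		/* unicast NS up to nd6_umaxtries, then
				 * free the entry (unless it is a p2p
				 * interface-address "static" entry) */
};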
*/ ln->ln_asked = 0; ln->ln_expire = 0; /* make it permanent */ ln->ln_state = ND6_LLINFO_STALE; } else { if (rt && rt->rt_llinfo) (void)nd6_free(rt, 0); ln = NULL; } break; } CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET_QUIET((struct vnet *) arg); INIT_VNET_INET6((struct vnet *) arg); int s; struct nd_defrouter *dr; struct nd_prefix *pr; struct in6_ifaddr *ia6, *nia6; struct in6_addrlifetime *lt6; callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, NULL); /* expire default router list */ s = splnet(); dr = TAILQ_FIRST(&V_nd_defrouter); while (dr) { if (dr->expire && dr->expire < time_second) { struct nd_defrouter *t; t = TAILQ_NEXT(dr, dr_entry); defrtrlist_del(dr); dr = t; } else { dr = TAILQ_NEXT(dr, dr_entry); } } /* * expire interface addresses. * In the past the loop was inside prefix expiry processing. * However, from a stricter spec-conformance standpoint, we should * rather separate address lifetimes and prefix lifetimes. */ addrloop: for (ia6 = V_in6_ifaddr; ia6; ia6 = nia6) { nia6 = ia6->ia_next; /* check address lifetime */ lt6 = &ia6->ia6_lifetime; if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else { /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } /* expire prefix list */ pr = V_nd_prefix.lh_first; while (pr) { /* * check prefix lifetime. * since pltime is just for autoconf, pltime processing for * prefix is not necessary. */ if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME && time_second - pr->ndpr_lastupdate > pr->ndpr_vltime) { struct nd_prefix *t; t = pr->ndpr_next; /* * address expiration and prefix expiration are * separate. NEVER perform in6_purgeaddr here. */ prelist_remove(pr); pr = t; } else pr = pr->ndpr_next; } splx(s); CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore non-autoconf addresses.
*/ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } return (0); } return (-1); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp) { INIT_VNET_INET6(ifp->if_vnet); struct llinfo_nd6 *ln, *nln; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that are installed * in the routing table, in order to keep additional side effects as * small as possible. */ for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); if (dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); if (!dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } /* Nuke prefix list entries toward ifp */ for (pr = V_nd_prefix.lh_first; pr; pr = npr) { npr = pr->ndpr_next; if (pr->ndpr_ifp == ifp) { /* * Because if_detach() does *not* release prefixes * while purging addresses, the reference count will * still be above zero. We therefore reset it to * make sure that the prefix really gets purged. */ pr->ndpr_refcnt = 0; /* * Previously, pr->ndpr_addr is removed as well, * but I strongly believe we don't have to do it. * nd6_purge() is only called from in6_ifdetach(), * which removes all the associated interface addresses * by itself. * (jinmei@kame.net 20010129) */ prelist_remove(pr); } } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (!V_ip6_forwarding && V_ip6_accept_rtadv) { /* XXX: too restrictive? */ /* refresh default router list */ defrouter_select(); } /* * Nuke neighbor cache entries for the ifp. * Note that rt->rt_ifp may not be the same as ifp, * due to KAME goto ours hack. See RTM_RESOLVE case in * nd6_rtrequest(), and ip6_input().
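/*
 * Editor's sketch (placeholder names): the circular, sentinel-headed
 * neighbor-cache list that the purge walk below traverses. This change
 * initializes the sentinel at runtime (V_llinfo_nd6.ln_next/ln_prev
 * point back at the sentinel itself) instead of using a static
 * initializer, so an empty list is one whose head points at itself.
 */
struct lln_sketch {
	struct lln_sketch *next;
	struct lln_sketch *prev;
};

static struct lln_sketch lln_head_sketch;

static void
lln_init_sketch(void)
{
	lln_head_sketch.next = &lln_head_sketch;	/* empty ring */
	lln_head_sketch.prev = &lln_head_sketch;
}

static int
lln_count_sketch(void)
{
	struct lln_sketch *ln;
	int n = 0;

	for (ln = lln_head_sketch.next; ln != &lln_head_sketch;
	    ln = ln->next)
		n++;			/* stop when back at the sentinel */
	return (n);
}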
*/ ln = V_llinfo_nd6.ln_next; while (ln && ln != &V_llinfo_nd6) { struct rtentry *rt; struct sockaddr_dl *sdl; nln = ln->ln_next; rt = ln->ln_rt; if (rt && rt->rt_gateway && rt->rt_gateway->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)rt->rt_gateway; if (sdl->sdl_index == ifp->if_index) nln = nd6_free(rt, 0); } ln = nln; } } struct rtentry * nd6_lookup(struct in6_addr *addr6, int create, struct ifnet *ifp) { INIT_VNET_INET6(curvnet); struct rtentry *rt; struct sockaddr_in6 sin6; char ip6buf[INET6_ADDRSTRLEN]; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; rt = rtalloc1((struct sockaddr *)&sin6, create, 0UL); if (rt) { if ((rt->rt_flags & RTF_LLINFO) == 0 && create) { /* * This is the case for the default route. * If we want to create a neighbor cache for the * address, we should free the route for the * destination and allocate an interface route. */ RTFREE_LOCKED(rt); rt = NULL; } } if (rt == NULL) { if (create && ifp) { int e; /* * If no route is available and create is set, * we allocate a host route for the destination * and treat it like an interface route. * This hack is necessary for a neighbor which can't * be covered by our own prefix. */ struct ifaddr *ifa = ifaof_ifpforaddr((struct sockaddr *)&sin6, ifp); if (ifa == NULL) return (NULL); /* * Create a new route. RTF_LLINFO is necessary * to create a Neighbor Cache entry for the * destination in nd6_rtrequest which will be * called in rtrequest via ifa->ifa_rtrequest. */ if ((e = rtrequest(RTM_ADD, (struct sockaddr *)&sin6, ifa->ifa_addr, (struct sockaddr *)&all1_sa, (ifa->ifa_flags | RTF_HOST | RTF_LLINFO) & ~RTF_CLONING, &rt)) != 0) { log(LOG_ERR, "nd6_lookup: failed to add route for a " "neighbor(%s), errno=%d\n", ip6_sprintf(ip6buf, addr6), e); } if (rt == NULL) return (NULL); RT_LOCK(rt); if (rt->rt_llinfo) { struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; ln->ln_state = ND6_LLINFO_NOSTATE; } } else return (NULL); } RT_LOCK_ASSERT(rt); RT_REMREF(rt); /* * Validation for the entry. * Note that the check for rt_llinfo is necessary because a cloned * route from a parent route that has the L flag (e.g. the default * route to a p2p interface) may have the flag, too, while the * destination is not actually a neighbor. * XXX: we can't use rt->rt_ifp to check for the interface, since * it might be the loopback interface if the entry is for our * own address on a non-loopback interface. Instead, we should * use rt->rt_ifa->ifa_ifp, which would specify the REAL * interface. * Note also that ifa_ifp and ifp may differ when we connect two * interfaces to a same link, install a link prefix to an interface, * and try to install a neighbor cache on an interface that does not * have a route to the prefix. */ if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || (ifp && rt->rt_ifa->ifa_ifp != ifp)) { if (create) { nd6log((LOG_DEBUG, "nd6_lookup: failed to lookup %s (if = %s)\n", ip6_sprintf(ip6buf, addr6), ifp ? if_name(ifp) : "unspec")); } RT_UNLOCK(rt); return (NULL); } RT_UNLOCK(rt); /* XXX not ready to return rt locked */ return (rt); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. 
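/*
 * Editor's sketch: what the IN6_ARE_MASKED_ADDR_EQUAL() test in the
 * on-link prefix check of the function below amounts to, byte-wise
 * comparison of the two 128-bit addresses under the prefix mask. This
 * is a self-contained stand-in, not the kernel macro.
 */
#include <stddef.h>
#include <stdint.h>

static int
masked_equal_sketch(const uint8_t a[16], const uint8_t b[16],
    const uint8_t mask[16])
{
	size_t i;

	for (i = 0; i < 16; i++)
		if ((a[i] & mask[i]) != (b[i] & mask[i]))
			return (0);
	return (1);	/* same prefix under the mask: treat as on-link */
}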
*/ static int nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) { INIT_VNET_INET6(ifp->if_vnet); struct nd_prefix *pr; struct ifaddr *dstaddr; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (pr->ndpr_ifp != ifp) continue; if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) return (1); } /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr); if ((dstaddr != NULL) && (dstaddr->ifa_ifp == ifp)) return (1); /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. * XXX: we restrict the condition to hosts, because routers usually do * not have the "default router list". */ if (!V_ip6_forwarding && TAILQ_FIRST(&V_nd_defrouter) == NULL && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) { if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ if (nd6_lookup(&addr->sin6_addr, 0, ifp) != NULL) return (1); return (0); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. */ static struct llinfo_nd6 * nd6_free(struct rtentry *rt, int gc) { INIT_VNET_INET6(curvnet); struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo, *next; struct in6_addr in6 = ((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; struct nd_defrouter *dr; /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer(ln, -1); if (!V_ip6_forwarding) { int s; s = splnet(); dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, rt->rt_ifp); if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. 
if (dr->expire > time_second) nd6_llinfo_settimer(ln, (dr->expire - time_second) * hz); else nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); splx(s); return (ln->ln_next); } if (ln->ln_router || dr) { /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&in6, rt->rt_ifp); } if (dr) { /* * Unreachability of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; /* * Since defrouter_select() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * refresh default router list */ defrouter_select(); } splx(s); } /* * Before deleting the entry, remember the next entry as the * return value. We need this because pfxlist_onlink_check() above * might have freed other entries (particularly the old next entry) as * a side effect (XXX). */ next = ln->ln_next; /* * Detach the route from the routing tree and the list of neighbor * caches, and disable the route entry not to be used in already * cached routes. */ rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), 0, (struct rtentry **)0); return (next); } /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * * XXX cost-effective methods? */ void nd6_nud_hint(struct rtentry *rt, struct in6_addr *dst6, int force) { INIT_VNET_INET6(curvnet); struct llinfo_nd6 *ln; /* * If the caller specified "rt", use that. Otherwise, resolve the * routing table by supplied "dst6". */ if (rt == NULL) { if (dst6 == NULL) return; if ((rt = nd6_lookup(dst6, 0, NULL)) == NULL) return; } if ((rt->rt_flags & RTF_GATEWAY) != 0 || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_llinfo == NULL || rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_LINK) { /* This is not a host route. */ return; } ln = (struct llinfo_nd6 *)rt->rt_llinfo; if (ln->ln_state < ND6_LLINFO_REACHABLE) return; /* * if we get upper-layer reachability confirmation many times, * it is possible we have false information. */ if (!force) { ln->ln_byhint++; if (ln->ln_byhint > V_nd6_maxnudhint) return; } ln->ln_state = ND6_LLINFO_REACHABLE; if (!ND6_LLINFO_PERMANENT(ln)) { nd6_llinfo_settimer(ln, (long)ND_IFINFO(rt->rt_ifp)->reachable * hz); } } /* * info - XXX unused */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr *gate = rt->rt_gateway; struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct ifnet *ifp = rt->rt_ifp; struct ifaddr *ifa; INIT_VNET_NET(ifp->if_vnet); INIT_VNET_INET6(ifp->if_vnet); RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_GATEWAY) != 0) return; if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) { /* * This is probably an interface direct route for a link * which does not need neighbor caches (e.g. fe80::%lo0/64). * We do not need special treatment below for such a route. * Moreover, the RTF_LLINFO flag which would be set below * would annoy the ndp(8) command.
*/ return; } if (req == RTM_RESOLVE && (nd6_need_cache(ifp) == 0 || /* stf case */ !nd6_is_new_addr_neighbor((struct sockaddr_in6 *)rt_key(rt), ifp))) { /* * FreeBSD and BSD/OS often make a cloned host route based * on a less-specific route (e.g. the default route). * If the less specific route does not have a "gateway" * (this is the case when the route just goes to a p2p or an * stf interface), we'll mistakenly make a neighbor cache for * the host route, and will see strange neighbor solicitation * for the corresponding destination. In order to avoid the * confusion, we check if the destination of the route is * a neighbor in terms of neighbor discovery, and stop the * process if not. Additionally, we remove the LLINFO flag * so that ndp(8) will not try to get the neighbor information * of the destination. */ rt->rt_flags &= ~RTF_LLINFO; return; } switch (req) { case RTM_ADD: /* * There is no backward compatibility :) * * if ((rt->rt_flags & RTF_HOST) == 0 && * SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) * rt->rt_flags |= RTF_CLONING; */ if ((rt->rt_flags & RTF_CLONING) || ((rt->rt_flags & RTF_LLINFO) && ln == NULL)) { /* * Case 1: This route should come from a route to * interface (RTF_CLONING case) or the route should be * treated as on-link but is currently not * (RTF_LLINFO && ln == NULL case). */ rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl); gate = rt->rt_gateway; SDL(gate)->sdl_type = ifp->if_type; SDL(gate)->sdl_index = ifp->if_index; if (ln) nd6_llinfo_settimer(ln, 0); if ((rt->rt_flags & RTF_CLONING) != 0) break; } /* * In IPv4 code, we try to announce a new RTF_ANNOUNCE entry here. * We don't do that here since llinfo is not ready yet. * * There are also a couple of other things to be discussed: * - unsolicited NA code needs improvement beforehand * - RFC2461 says we MAY send multicast unsolicited NA * (7.2.6 paragraph 4), however, it also says that we * SHOULD provide a mechanism to prevent multicast NA storm. * we don't have anything like it right now. * note that the mechanism needs a mutual agreement * between proxies, which means that we need to implement * a new protocol, or a new kludge. * - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA. * we need to check ip6forwarding before sending it. * (or should we allow proxy ND configuration only for * routers? there's no mention about proxy ND from hosts) */ /* FALLTHROUGH */ case RTM_RESOLVE: if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) { /* * Address resolution isn't necessary for a point to * point link, so we can skip this test for a p2p link. */ if (gate->sa_family != AF_LINK || gate->sa_len < sizeof(null_sdl)) { log(LOG_DEBUG, "nd6_rtrequest: bad gateway value: %s\n", if_name(ifp)); break; } SDL(gate)->sdl_type = ifp->if_type; SDL(gate)->sdl_index = ifp->if_index; } if (ln != NULL) break; /* This happens on a route change */ /* * Case 2: This route may come from cloning, or a manual route * add with a LL address. */ R_Malloc(ln, struct llinfo_nd6 *, sizeof(*ln)); rt->rt_llinfo = (caddr_t)ln; if (ln == NULL) { log(LOG_DEBUG, "nd6_rtrequest: malloc failed\n"); break; } V_nd6_inuse++; V_nd6_allocated++; bzero(ln, sizeof(*ln)); RT_ADDREF(rt); ln->ln_rt = rt; callout_init(&ln->ln_timer_ch, 0); /* this is required for "ndp" command. - shin */ if (req == RTM_ADD) { /* * gate should have some valid AF_LINK entry, * and ln->ln_expire should have some lifetime * which is specified by ndp command.
*/ ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; } else { /* * When req == RTM_RESOLVE, rt is created and * initialized in rtrequest(), so rt_expire is 0. */ ln->ln_state = ND6_LLINFO_NOSTATE; nd6_llinfo_settimer(ln, 0); } rt->rt_flags |= RTF_LLINFO; ln->ln_next = V_llinfo_nd6.ln_next; V_llinfo_nd6.ln_next = ln; ln->ln_prev = &V_llinfo_nd6; ln->ln_next->ln_prev = ln; /* * check if rt_key(rt) is one of my address assigned * to the interface. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(rt->rt_ifp, &SIN6(rt_key(rt))->sin6_addr); if (ifa) { caddr_t macp = nd6_ifptomac(ifp); nd6_llinfo_settimer(ln, -1); ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (macp) { bcopy(macp, LLADDR(SDL(gate)), ifp->if_addrlen); SDL(gate)->sdl_alen = ifp->if_addrlen; } if (V_nd6_useloopback) { rt->rt_ifp = &V_loif[0]; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr * corresponding to the address. * We need this because when we refer * rt_ifa->ia6_flags in ip6_input, we assume * that the rt_ifa points to the address instead * of the loopback address. */ if (ifa != rt->rt_ifa) { IFAFREE(rt->rt_ifa); IFAREF(ifa); rt->rt_ifa = ifa; } } } else if (rt->rt_flags & RTF_ANNOUNCE) { nd6_llinfo_settimer(ln, -1); ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { struct in6_addr llsol; int error; llsol = SIN6(rt_key(rt))->sin6_addr; llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_setscope(&llsol, ifp, NULL)) break; if (in6_addmulti(&llsol, ifp, &error, 0) == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), ip6_sprintf(ip6buf, &llsol), error)); } } } break; case RTM_DELETE: if (ln == NULL) break; /* leave from solicited node multicast for proxy ND */ if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && (ifp->if_flags & IFF_MULTICAST) != 0) { struct in6_addr llsol; struct in6_multi *in6m; llsol = SIN6(rt_key(rt))->sin6_addr; llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_setscope(&llsol, ifp, NULL) == 0) { IN6_LOOKUP_MULTI(llsol, ifp, in6m); if (in6m) in6_delmulti(in6m); } else ; /* XXX: should not happen. bark here? 
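 *
 * (Aside, for illustration: the join above and the leave here build
 * the same solicited-node group ff02:0:0:0:0:1:ffXX:XXXX (RFC 4291,
 * 2.7.1); a hypothetical shared helper would be:
 *
 *	static void
 *	example_solicited_node(const struct in6_addr *taddr,
 *	    struct in6_addr *llsol)
 *	{
 *		llsol->s6_addr32[0] = IPV6_ADDR_INT32_MLL;
 *		llsol->s6_addr32[1] = 0;
 *		llsol->s6_addr32[2] = htonl(1);
 *		llsol->s6_addr32[3] = taddr->s6_addr32[3];
 *		llsol->s6_addr8[12] = 0xff;
 *	}
 *
 * with the scope still to be embedded via in6_setscope().)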
*/ } V_nd6_inuse--; ln->ln_next->ln_prev = ln->ln_prev; ln->ln_prev->ln_next = ln->ln_next; ln->ln_prev = NULL; nd6_llinfo_settimer(ln, -1); RT_REMREF(rt); rt->rt_llinfo = 0; rt->rt_flags &= ~RTF_LLINFO; clear_llinfo_pqueue(ln); Free((caddr_t)ln); } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { INIT_VNET_INET6(ifp->if_vnet); struct in6_drlist *drl = (struct in6_drlist *)data; struct in6_oprlist *oprl = (struct in6_oprlist *)data; struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; struct nd_defrouter *dr; struct nd_prefix *pr; struct rtentry *rt; int i = 0, error = 0; int s; switch (cmd) { case SIOCGDRLST_IN6: /* * obsolete API, use sysctl under net.inet6.icmp6 */ bzero(drl, sizeof(*drl)); s = splnet(); dr = TAILQ_FIRST(&V_nd_defrouter); while (dr && i < DRLSTSIZ) { drl->defrouter[i].rtaddr = dr->rtaddr; in6_clearscope(&drl->defrouter[i].rtaddr); drl->defrouter[i].flags = dr->flags; drl->defrouter[i].rtlifetime = dr->rtlifetime; drl->defrouter[i].expire = dr->expire; drl->defrouter[i].if_index = dr->ifp->if_index; i++; dr = TAILQ_NEXT(dr, dr_entry); } splx(s); break; case SIOCGPRLST_IN6: /* * obsolete API, use sysctl under net.inet6.icmp6 * * XXX the structure in6_prlist was changed in a backward- * incompatible manner. in6_oprlist is used for SIOCGPRLST_IN6, * in6_prlist is used for nd6_sysctl() - fill_prlist(). */ /* * XXX the meaning of fields, especially "raflags", is very * different between the RA prefix list and the RR/static prefix * list. how about separating the ioctls into two? */ bzero(oprl, sizeof(*oprl)); s = splnet(); pr = V_nd_prefix.lh_first; while (pr && i < PRLSTSIZ) { struct nd_pfxrouter *pfr; int j; oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr; oprl->prefix[i].raflags = pr->ndpr_raf; oprl->prefix[i].prefixlen = pr->ndpr_plen; oprl->prefix[i].vltime = pr->ndpr_vltime; oprl->prefix[i].pltime = pr->ndpr_pltime; oprl->prefix[i].if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) oprl->prefix[i].expire = 0; else { time_t maxexpire; /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) { oprl->prefix[i].expire = pr->ndpr_lastupdate + pr->ndpr_vltime; } else oprl->prefix[i].expire = maxexpire; } pfr = pr->ndpr_advrtrs.lh_first; j = 0; while (pfr) { if (j < DRLSTSIZ) { #define RTRADDR oprl->prefix[i].advrtr[j] RTRADDR = pfr->router->rtaddr; in6_clearscope(&RTRADDR); #undef RTRADDR } j++; pfr = pfr->pfr_next; } oprl->prefix[i].advrtrs = j; oprl->prefix[i].origin = PR_ORIG_RA; i++; pr = pr->ndpr_next; } splx(s); break; case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * Used to change host variables from userland. * Intended for use on a router to reflect RA configurations.
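 *
 * A hypothetical userland sketch (illustrative descriptor and values,
 * not part of this change), doing a read-modify-write of the
 * per-interface ND parameters:
 *
 *	struct in6_ndireq ndi;
 *
 *	memset(&ndi, 0, sizeof(ndi));
 *	strlcpy(ndi.ifname, "em0", sizeof(ndi.ifname));
 *	if (ioctl(s6, SIOCGIFINFO_IN6, &ndi) != -1) {
 *		ndi.ndi.retrans = 2000;		/* in milliseconds */
 *		(void)ioctl(s6, SIOCSIFINFO_IN6, &ndi);
 *	}
 *
 * where s6 is an AF_INET6 socket descriptor.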
*/ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: ND_IFINFO(ifp)->flags = ND.flags; break; #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct nd_prefix *pr, *next; s = splnet(); for (pr = V_nd_prefix.lh_first; pr; pr = next) { struct in6_ifaddr *ia, *ia_next; next = pr->ndpr_next; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ /* do we really have to remove addresses as well? */ for (ia = V_in6_ifaddr; ia; ia = ia_next) { /* ia might be removed. keep the next ptr. */ ia_next = ia->ia_next; if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } prelist_remove(pr); } splx(s); break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_defrouter *dr, *next; s = splnet(); defrouter_reset(); for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = next) { next = TAILQ_NEXT(dr, dr_entry); defrtrlist_del(dr); } defrouter_select(); splx(s); break; } case SIOCGNBRINFO_IN6: { struct llinfo_nd6 *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); s = splnet(); if ((rt = nd6_lookup(&nb_addr, 0, ifp)) == NULL) { error = EINVAL; splx(s); break; } ln = (struct llinfo_nd6 *)rt->rt_llinfo; nbi->state = ln->ln_state; nbi->asked = ln->ln_asked; nbi->isrouter = ln->ln_router; nbi->expire = ln->ln_expire; splx(s); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information */ struct rtentry * nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { INIT_VNET_INET6(curvnet); struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; int is_newentry; struct sockaddr_dl *sdl = NULL; int do_update; int olladdr; int llchange; int newstate = 0; if (ifp == NULL) panic("ifp == NULL in nd6_cache_lladdr"); if (from == NULL) panic("from == NULL in nd6_cache_lladdr"); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return NULL; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). 
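 *
 * Two illustrative asides, not part of this change. The caller-side
 * length check mentioned above is the one done by nd6_ns_input() and
 * nd6_na_input(); the option length is padded to a multiple of 8
 * bytes, so for Ethernet lladdrlen must equal ((6 + 2 + 7) & ~7) == 8:
 *
 *	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen)
 *		goto bad;
 *
 * And the (newentry, olladdr, lladdr, llchange) table further down
 * condenses to the following hypothetical helper:
 *
 *	static int
 *	example_new_state(int newentry, int olladdr, int haslladdr,
 *	    int llchange, int *do_update)
 *	{
 *		*do_update = 1;
 *		if (newentry)				/* (6), (7) */
 *			return (haslladdr ? ND6_LLINFO_STALE :
 *			    ND6_LLINFO_NOSTATE);
 *		if ((!olladdr && haslladdr) ||		/* (3) */
 *		    (olladdr && haslladdr && llchange))	/* (5) */
 *			return (ND6_LLINFO_STALE);
 *		*do_update = 0;				/* (1-2), (4) */
 *		return (0);
 *	}
 *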
*/ rt = nd6_lookup(from, 0, ifp); if (rt == NULL) { rt = nd6_lookup(from, 1, ifp); is_newentry = 1; } else { /* do nothing if static ndp is set */ if (rt->rt_flags & RTF_STATIC) return NULL; is_newentry = 0; } if (rt == NULL) return NULL; if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) { fail: (void)nd6_free(rt, 0); return NULL; } ln = (struct llinfo_nd6 *)rt->rt_llinfo; if (ln == NULL) goto fail; if (rt->rt_gateway == NULL) goto fail; if (rt->rt_gateway->sa_family != AF_LINK) goto fail; sdl = SDL(rt->rt_gateway); olladdr = (sdl->sdl_alen) ? 1 : 0; if (olladdr && lladdr) { if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y -- (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ if (lladdr) { /* (3-5) and (7) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); } if (!is_newentry) { if ((!olladdr && lladdr != NULL) || /* (3) */ (olladdr && lladdr != NULL && llchange)) { /* (5) */ do_update = 1; newstate = ND6_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; if (lladdr == NULL) /* (6) */ newstate = ND6_LLINFO_NOSTATE; else /* (7) */ newstate = ND6_LLINFO_STALE; } if (do_update) { /* * Update the state of the neighbor cache. */ ln->ln_state = newstate; if (ln->ln_state == ND6_LLINFO_STALE) { /* * XXX: since nd6_output() below will cause * state tansition to DELAY and reset the timer, * we must set the timer now, although it is actually * meaningless. */ nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); if (ln->ln_hold) { struct mbuf *m_hold, *m_hold_next; /* * reset the ln_hold in advance, to explicitly * prevent a ln_hold lookup in nd6_output() * (wouldn't happen, though...) */ for (m_hold = ln->ln_hold, ln->ln_hold = NULL; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* * we assume ifp is not a p2p here, so * just set the 2nd argument as the * 1st one. */ nd6_output(ifp, ifp, m_hold, (struct sockaddr_in6 *)rt_key(rt), rt); } } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* probe right away */ nd6_llinfo_settimer((void *)ln, 0); } } /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * newentry olladdr lladdr llchange NS RS RA redir * D R * 0 n n -- (1) c ? s * 0 y n -- (2) c s s * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. 
[RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; else if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln->ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; } /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if (do_update && ln->ln_router && !V_ip6_forwarding && V_ip6_accept_rtadv) defrouter_select(); return rt; } static void nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); INIT_VNET_NET((struct vnet *) arg); INIT_VNET_INET6((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, NULL); IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK(); CURVNET_RESTORE(); } #define senderr(e) { error = (e); goto bad;} int nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, struct sockaddr_in6 *dst, struct rtentry *rt0) { INIT_VNET_INET6(curvnet); struct mbuf *m = m0; struct rtentry *rt = rt0; struct sockaddr_in6 *gw6 = NULL; struct llinfo_nd6 *ln = NULL; int error = 0; if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr)) goto sendpkt; if (nd6_need_cache(ifp) == 0) goto sendpkt; /* * next hop determination. This routine is derived from ether_output. */ /* NB: the locking here is tortuous... */ if (rt != NULL) RT_LOCK(rt); again: if (rt != NULL) { if ((rt->rt_flags & RTF_UP) == 0) { RT_UNLOCK(rt); rt0 = rt = rtalloc1((struct sockaddr *)dst, 1, 0UL); if (rt != NULL) { RT_REMREF(rt); if (rt->rt_ifp != ifp) /* * XXX maybe we should update ifp too, * but the original code didn't and I * don't know what is correct here. */ goto again; } else senderr(EHOSTUNREACH); } if (rt->rt_flags & RTF_GATEWAY) { gw6 = (struct sockaddr_in6 *)rt->rt_gateway; /* * We skip link-layer address resolution and NUD * if the gateway is not a neighbor from ND point * of view, regardless of the value of nd_ifinfo.flags. * The second condition is a bit tricky; we skip * if the gateway is our own address, which is * sometimes used to install a route to a p2p link. */ if (!nd6_is_addr_neighbor(gw6, ifp) || in6ifa_ifpwithaddr(ifp, &gw6->sin6_addr)) { RT_UNLOCK(rt); /* * We allow this kind of tricky route only * when the outgoing interface is p2p. 
* XXX: we may need a more generic rule here. */ if ((ifp->if_flags & IFF_POINTOPOINT) == 0) senderr(EHOSTUNREACH); goto sendpkt; } if (rt->rt_gwroute == NULL) goto lookup; rt = rt->rt_gwroute; RT_LOCK(rt); /* NB: gwroute */ if ((rt->rt_flags & RTF_UP) == 0) { RTFREE_LOCKED(rt); /* unlock gwroute */ rt = rt0; rt0->rt_gwroute = NULL; lookup: RT_UNLOCK(rt0); rt = rtalloc1(rt->rt_gateway, 1, 0UL); if (rt == rt0) { RT_REMREF(rt0); RT_UNLOCK(rt0); senderr(EHOSTUNREACH); } RT_LOCK(rt0); if (rt0->rt_gwroute != NULL) RTFREE(rt0->rt_gwroute); rt0->rt_gwroute = rt; if (rt == NULL) { RT_UNLOCK(rt0); senderr(EHOSTUNREACH); } } RT_UNLOCK(rt0); } RT_UNLOCK(rt); } /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address (i.e. not a multicast). */ /* Look up the neighbor cache for the nexthop */ if (rt && (rt->rt_flags & RTF_LLINFO) != 0) ln = (struct llinfo_nd6 *)rt->rt_llinfo; else { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ if (nd6_is_addr_neighbor(dst, ifp) && (rt = nd6_lookup(&dst->sin6_addr, 1, ifp)) != NULL) ln = (struct llinfo_nd6 *)rt->rt_llinfo; } if (ln == NULL || rt == NULL) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0 && !(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p, rt=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), ln, rt); senderr(EIO); /* XXX: good error? */ } goto sendpkt; /* send anyway */ } /* We don't have to do link-layer address resolution on a p2p link. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ln->ln_state < ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and set a timer to * expire in DELAY_FIRST_PROBE_TIME seconds, so that neighbor * unreachability detection is performed on expiration. * (RFC 2461 7.3.3) */ if (ln->ln_state == ND6_LLINFO_STALE) { ln->ln_asked = 0; ln->ln_state = ND6_LLINFO_DELAY; nd6_llinfo_settimer(ln, (long)V_nd6_delay * hz); } /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (ln->ln_state > ND6_LLINFO_INCOMPLETE) goto sendpkt; /* * There is a neighbor cache entry, but no link-layer address * response yet. Append this latest packet to the end of the * hold queue, as long as the number of queued packets does not * exceed nd6_maxqueuelen. When it does, the oldest packet in the * queue is removed. */ if (ln->ln_state == ND6_LLINFO_NOSTATE) ln->ln_state = ND6_LLINFO_INCOMPLETE; if (ln->ln_hold) { struct mbuf *m_hold; int i; i = 0; for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold->m_nextpkt) { i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = ln->ln_hold; ln->ln_hold = ln->ln_hold->m_nextpkt; m_freem(m_hold); i--; } } else { ln->ln_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation.
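 *
 * (Unit note, for illustration: ND_IFINFO(ifp)->retrans is kept in
 * milliseconds while nd6_llinfo_settimer() takes ticks, hence the
 * "* hz / 1000" below; e.g. the RFC 2461 default RETRANS_TIMER of
 * 1000 ms comes out to hz ticks.)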
*/ if (!ND6_LLINFO_PERMANENT(ln) && ln->ln_asked == 0) { ln->ln_asked++; nd6_llinfo_settimer(ln, (long)ND_IFINFO(ifp)->retrans * hz / 1000); nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); } return (0); sendpkt: /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { error = ENETDOWN; /* better error? */ goto bad; } #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif if ((ifp->if_flags & IFF_LOOPBACK) != 0) { return ((*ifp->if_output)(origifp, m, (struct sockaddr *)dst, rt)); } return ((*ifp->if_output)(ifp, m, (struct sockaddr *)dst, rt)); bad: if (m) m_freem(m); return (error); } #undef senderr int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than ARCnet, Ethernet, FDDI and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif #ifdef IFT_CARP case IFT_CARP: #endif case IFT_GIF: /* XXX need more cases? */ case IFT_PPP: case IFT_TUNNEL: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } int nd6_storelladdr(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, struct sockaddr *dst, u_char *desten) { struct sockaddr_dl *sdl; struct rtentry *rt; int error; if (m->m_flags & M_MCAST) { int i; switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif case IFT_BRIDGE: case IFT_ISO88025: ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr, desten); return (0); case IFT_IEEE1394: /* * netbsd can use if_broadcastaddr, but we don't do so * to reduce # of ifdef. 
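 *
 * (Looking back at the Ethernet-family cases above, for illustration:
 * ETHER_MAP_IPV6_MULTICAST() keeps the low 32 bits of the group, so
 * ff02::1:ff00:1234 maps to 33:33:ff:00:12:34; roughly:
 *
 *	desten[0] = desten[1] = 0x33;
 *	bcopy(&SIN6(dst)->sin6_addr.s6_addr[12], &desten[2], 4);
 *
 * which is why no resolution or NUD is ever needed for multicast.)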
*/ for (i = 0; i < ifp->if_addrlen; i++) desten[i] = ~0; return (0); case IFT_ARCNET: *desten = 0; return (0); default: m_freem(m); return (EAFNOSUPPORT); } } if (rt0 == NULL) { /* this could happen, if we could not allocate memory */ m_freem(m); return (ENOMEM); } error = rt_check(&rt, &rt0, dst); if (error) { m_freem(m); return (error); } RT_UNLOCK(rt); if (rt->rt_gateway->sa_family != AF_LINK) { printf("nd6_storelladdr: something odd happens\n"); m_freem(m); return (EINVAL); } sdl = SDL(rt->rt_gateway); if (sdl->sdl_alen == 0) { /* this should be impossible, but we bark here for debugging */ printf("nd6_storelladdr: sdl_alen == 0\n"); m_freem(m); return (EINVAL); } bcopy(LLADDR(sdl), desten, sdl->sdl_alen); return (0); } static void clear_llinfo_pqueue(struct llinfo_nd6 *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; m_freem(m_hold); } ln->ln_hold = NULL; return; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_icmp6); #endif SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLFLAG_RD, nd6_sysctl_drlist, ""); SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLFLAG_RD, nd6_sysctl_prlist, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_RW, nd6_maxqueuelen, 1, ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); int error; char buf[1024] __aligned(4); struct in6_defrouter *d, *de; struct nd_defrouter *dr; if (req->newptr) return EPERM; error = 0; for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { d = (struct in6_defrouter *)buf; de = (struct in6_defrouter *)(buf + sizeof(buf)); if (d + 1 <= de) { bzero(d, sizeof(*d)); d->rtaddr.sin6_family = AF_INET6; d->rtaddr.sin6_len = sizeof(d->rtaddr); d->rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d->rtaddr); if (error != 0) return (error); d->flags = dr->flags; d->rtlifetime = dr->rtlifetime; d->expire = dr->expire; d->if_index = dr->ifp->if_index; } else panic("buffer too short"); error = SYSCTL_OUT(req, buf, sizeof(*d)); if (error) break; } return (error); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); int error; char buf[1024] __aligned(4); struct in6_prefix *p, *pe; struct nd_prefix *pr; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return EPERM; error = 0; for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { u_short advrtrs; size_t advance; struct sockaddr_in6 *sin6, *s6; struct nd_pfxrouter *pfr; p = (struct in6_prefix *)buf; pe = (struct in6_prefix *)(buf + sizeof(buf)); if (p + 1 <= pe) { bzero(p, sizeof(*p)); sin6 = (struct sockaddr_in6 *)(p + 1); p->prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p->prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p->prefix.sin6_addr)); /* XXX: press on... */ } p->raflags = pr->ndpr_raf; p->prefixlen = pr->ndpr_plen; p->vltime = pr->ndpr_vltime; p->pltime = pr->ndpr_pltime; p->if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p->expire = 0; else { time_t maxexpire; /* XXX: we assume time_t is signed. 
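 *
 * (Worked example, for illustration: in two's complement (-1) is all
 * ones, and clearing the sign bit leaves the largest positive value,
 * so the mask below yields 0x7fffffff for a 32-bit time_t and
 * 0x7fffffffffffffff for a 64-bit one. The comparison that follows
 * then keeps lastupdate + vltime from overflowing.)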
*/ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) { p->expire = pr->ndpr_lastupdate + pr->ndpr_vltime; } else p->expire = maxexpire; } p->refcnt = pr->ndpr_refcnt; p->flags = pr->ndpr_stateflags; p->origin = PR_ORIG_RA; advrtrs = 0; for (pfr = pr->ndpr_advrtrs.lh_first; pfr; pfr = pfr->pfr_next) { if ((void *)&sin6[advrtrs + 1] > (void *)pe) { advrtrs++; continue; } s6 = &sin6[advrtrs]; bzero(s6, sizeof(*s6)); s6->sin6_family = AF_INET6; s6->sin6_len = sizeof(*sin6); s6->sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(s6)) { log(LOG_ERR, "scope error in " "prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); } advrtrs++; } p->advrtrs = advrtrs; } else panic("buffer too short"); advance = sizeof(*p) + sizeof(*sin6) * advrtrs; error = SYSCTL_OUT(req, buf, advance); if (error) break; } return (error); } Index: head/sys/netinet6/nd6_nbr.c =================================================================== --- head/sys/netinet6/nd6_nbr.c (revision 185087) +++ head/sys/netinet6/nd6_nbr.c (revision 185088) @@ -1,1500 +1,1504 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_carp.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_stoptimer(struct dadq *); static void nd6_dad_timer(struct ifaddr *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); static void nd6_dad_na_input(struct ifaddr *); -static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ -static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ +#ifdef VIMAGE_GLOBALS +int dad_ignore_ns; +int dad_maxtry; +#endif /* * Input a Neighbor Solicitation Message. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; struct ifaddr *ifa = NULL; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; union nd_opts ndopts; struct sockaddr_dl *proxydl = NULL; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; if (in6_setscope(&taddr6, ifp, NULL) != 0) goto bad; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be a solicited node multicast address. */ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && /* don't check ifindex portion */ daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } } else if (!nd6_onlink_ns_rfc4861) { struct sockaddr_in6 src_sa6; /* * According to recent IETF discussions, it is not a good idea * to accept a NS from an address which would not be deemed * to be a neighbor otherwise. This point is expected to be * clarified in future revisions of the specification. 
*/ bzero(&src_sa6, sizeof(src_sa6)); src_sa6.sin6_family = AF_INET6; src_sa6.sin6_len = sizeof(src_sa6); src_sa6.sin6_addr = saddr6; if (!nd6_is_addr_neighbor(&src_sa6, ifp)) { nd6log((LOG_INFO, "nd6_ns_input: " "NS packet from non-neighbor\n")); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); goto bad; } icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ns_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is unicast/anycast MUST NOT add * NS IP dst is solicited-node multicast MUST add * * In implementation, we add target link-layer address by default. * We do not add one in MUST NOT cases. */ if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ #ifdef DEV_CARP if (ifp->if_carp) ifa = carp_iamatch6(ifp->if_carp, &taddr6); if (ifa == NULL) ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); #else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); #endif /* (2) check. */ if (ifa == NULL) { struct rtentry *rt; struct sockaddr_in6 tsin6; int need_proxy; #ifdef RADIX_MPATH struct route_in6 ro; #endif bzero(&tsin6, sizeof tsin6); tsin6.sin6_len = sizeof(struct sockaddr_in6); tsin6.sin6_family = AF_INET6; tsin6.sin6_addr = taddr6; #ifdef RADIX_MPATH bzero(&ro, sizeof(ro)); ro.ro_dst = tsin6; rtalloc_mpath((struct route *)&ro, RTF_ANNOUNCE); rt = ro.ro_rt; #else rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0); #endif need_proxy = (rt && (rt->rt_flags & RTF_ANNOUNCE) != 0 && rt->rt_gateway->sa_family == AF_LINK); if (rt) rtfree(rt); if (need_proxy) { /* * proxy NDP for single entry */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ifa) { proxy = 1; proxydl = SDL(rt->rt_gateway); } } } if (ifa == NULL) { /* * We've got an NS packet, and we don't have that adddress * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &saddr6))); goto freeit; } /* * We have neighbor solicitation packet, with target address equals to * one of my tentative address. * * src addr how to process? 
* --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462. */ if (tentative) { /* * If source address is unspecified address, it is for * duplicate address detection. * * If not, the packet is for addess resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa); goto freeit; } /* * If the source address is unspecified address, entries must not * be created or updated. * It looks that sender is performing DAD. Output NA toward * all-node multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { struct in6_addr in6_all; in6_all = in6addr_linklocal_allnodes; if (in6_setscope(&in6_all, ifp, NULL) != 0) goto bad; nd6_na_output(ifp, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), tlladdr, (struct sockaddr *)proxydl); goto freeit; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, tlladdr, (struct sockaddr *)proxydl); freeit: m_freem(m); return; bad: nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(ip6bufs, &saddr6))); nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(ip6bufs, &daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(ip6bufs, &taddr6))); V_icmp6stat.icp6s_badns++; m_freem(m); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * ln - for source address determination * dad - duplicate address detection */ void nd6_ns_output(struct ifnet *ifp, const struct in6_addr *daddr6, const struct in6_addr *taddr6, struct llinfo_nd6 *ln, int dad) { INIT_VNET_INET6(ifp->if_vnet); struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_addr *src, src_in; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac; struct route_in6 ro; bzero(&ro, sizeof(ro)); if (IN6_IS_ADDR_MULTICAST(taddr6)) return; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_ns_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? 
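 *
 * (Illustration: max_linkhdr bytes of headroom are reserved so the
 * link layer can later prepend its header in place; for an Ethernet
 * NS the payload behind it is 40 (IPv6) + 24 (NS) + 8 (source
 * link-layer option) = 72 bytes. MH_ALIGN() would instead push the
 * data to the end of the mbuf.)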
*/ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto bad; } if (!dad) { /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. * Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (saddr6), if: * - saddr6 is given from the caller (by giving "ln"), and * - saddr6 belongs to the outgoing interface. * Otherwise, we perform the source address selection as usual. */ struct ip6_hdr *hip6; /* hold ip6 */ struct in6_addr *hsrc = NULL; if (ln && ln->ln_hold) { /* * assuming every packet in ln_hold has the same IP * header */ hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*hip6) < ln->ln_hold->m_len) hsrc = &hip6->ip6_src; else hsrc = NULL; } if (hsrc && in6ifa_ifpwithaddr(ifp, hsrc)) src = hsrc; else { int error; struct sockaddr_in6 dst_sa; bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, NULL, &error); if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "nd6_ns_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(ip6buf, &dst_sa.sin6_addr), error)); goto bad; } } } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) * We actually don't have to 0-clear the address (we did it * above), but we do so here explicitly to make the intention * clearer. */ bzero(&src_in, sizeof(src_in)); src = &src_in; } ip6->ip6_src = *src; nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (!dad && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); ip6_output(m, NULL, &ro, dad ? 
IPV6_UNSPECSRC : 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; if (ro.ro_rt) { /* we don't cache this route. */ RTFREE(ro.ro_rt); } return; bad: if (ro.ro_rt) { RTFREE(ro.ro_rt); } m_freem(m); return; } /* * Neighbor advertisement input handling. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; int is_router; int is_solicited; int is_override; char *lladdr = NULL; int lladdrlen = 0; struct ifaddr *ifa; struct llinfo_nd6 *ln; struct rtentry *rt; struct sockaddr_dl *sdl; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); taddr6 = nd_na->nd_na_target; if (in6_setscope(&taddr6, ifp, NULL)) goto bad; /* XXX: impossible */ if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, "nd6_na_input: invalid target address %s\n", ip6_sprintf(ip6bufs, &taddr6))); goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, "nd6_na_input: a solicited adv is multicasted\n")); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_na_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { nd6_dad_na_input(ifa); goto freeit; } /* Just for safety, maybe unnecessary. */ if (ifa) { log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &taddr6)); goto freeit; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be * discarded. 
*/ rt = nd6_lookup(&taddr6, 0, ifp); if ((rt == NULL) || ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) || ((sdl = SDL(rt->rt_gateway)) == NULL)) goto freeit; if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link-layer has address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && lladdr == NULL) goto freeit; /* * Record link-layer address, and update the state. */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (!ND6_LLINFO_PERMANENT(ln)) { nd6_llinfo_settimer(ln, (long)ND_IFINFO(rt->rt_ifp)->reachable * hz); } } else { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } if ((ln->ln_router = is_router) != 0) { /* * This means a router's state has changed from * non-reachable to probably reachable, and might * affect the status of associated prefixes.. */ pfxlist_onlink_check(); } } else { int llchange; /* * Check if the link-layer address has changed or not. */ if (lladdr == NULL) llchange = 0; else { if (sdl->sdl_alen) { if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 1; } /* * This is VERY complex. Look at it with care. * * override solicit lladdr llchange action * (L: record lladdr) * * 0 0 n -- (2c) * 0 0 y n (2b) L * 0 0 y y (1) REACHABLE->STALE * 0 1 n -- (2c) *->REACHABLE * 0 1 y n (2b) L *->REACHABLE * 0 1 y y (1) REACHABLE->STALE * 1 0 n -- (2a) * 1 0 y n (2a) L * 1 0 y y (2a) L *->STALE * 1 1 n -- (2a) *->REACHABLE * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ if (!is_override && (lladdr != NULL && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } goto freeit; } else if (is_override /* (2a) */ || (!is_override && (lladdr != NULL && !llchange)) /* (2b) */ || lladdr == NULL) { /* (2c) */ /* * Update link-local address, if any. */ if (lladdr != NULL) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) { ln->ln_state = ND6_LLINFO_REACHABLE; ln->ln_byhint = 0; if (!ND6_LLINFO_PERMANENT(ln)) { nd6_llinfo_settimer(ln, (long)ND_IFINFO(ifp)->reachable * hz); } } else { if (lladdr != NULL && llchange) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } } } if (ln->ln_router && !is_router) { /* * The peer dropped the router flag. * Remove the sender from the Default Router List and * update the Destination Cache entries. */ struct nd_defrouter *dr; struct in6_addr *in6; int s; in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; /* * Lock to protect the default router list. * XXX: this might be unnecessary, since this function * is only called under the network software interrupt * context. However, we keep it just for safety. */ s = splnet(); dr = defrouter_lookup(in6, ifp); if (dr) defrtrlist_del(dr); else if (!V_ip6_forwarding) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used * as a next hop for some destinations * (e.g. redirect case). So we must * call rt6_flush explicitly. 
*/ rt6_flush(&ip6->ip6_src, ifp); } splx(s); } ln->ln_router = is_router; } rt->rt_flags &= ~RTF_REJECT; ln->ln_asked = 0; if (ln->ln_hold) { struct mbuf *m_hold, *m_hold_next; /* * reset the ln_hold in advance, to explicitly * prevent a ln_hold lookup in nd6_output() * (wouldn't happen, though...) */ for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* * we assume ifp is not a loopback here, so just set * the 2nd argument as the 1st one. */ nd6_output(ifp, ifp, m_hold, (struct sockaddr_in6 *)rt_key(rt), rt); } ln->ln_hold = NULL; } freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badna++; m_freem(m); } /* * Neighbor advertisement output handling. * * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) * * tlladdr - 1 if include target link-layer address * sdl0 - sockaddr_dl (= proxy NA) or NULL */ void nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0) { INIT_VNET_INET6(ifp->if_vnet); struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct ip6_moptions im6o; struct in6_addr *src, daddr6; struct sockaddr_in6 dst_sa; int icmp6len, maxlen, error; caddr_t mac = NULL; struct route_in6 ro; bzero(&ro, sizeof(ro)); daddr6 = *daddr6_0; /* make a local copy for modification */ /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; if (max_linkhdr + maxlen >= MCLBYTES) { #ifdef DIAGNOSTIC printf("nd6_na_output: max_linkhdr + maxlen >= MCLBYTES " "(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES); #endif return; } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && max_linkhdr + maxlen >= MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); m = NULL; } } if (m == NULL) return; m->m_pkthdr.rcvif = NULL; if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; daddr6.s6_addr16[1] = 0; daddr6.s6_addr32[1] = 0; daddr6.s6_addr32[2] = 0; daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; if (in6_setscope(&daddr6, ifp, NULL)) goto bad; flags &= ~ND_NA_FLAG_SOLICITED; } ip6->ip6_dst = daddr6; bzero(&dst_sa, sizeof(struct sockaddr_in6)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(struct sockaddr_in6); dst_sa.sin6_addr = daddr6; /* * Select a source whose scope is the same as that of the dest. 
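 *
 * (Caller-side sketch, hypothetical: an unsolicited announcement of
 * myaddr6 -- e.g. for proxying or an address takeover -- mirrors the
 * DAD reply in nd6_ns_input():
 *
 *	struct in6_addr allnodes = in6addr_linklocal_allnodes;
 *
 *	if (in6_setscope(&allnodes, ifp, NULL) == 0)
 *		nd6_na_output(ifp, &allnodes, &myaddr6,
 *		    ND_NA_FLAG_OVERRIDE, 1, NULL);
 *
 * with the solicited flag deliberately left clear.)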
*/ bcopy(&dst_sa, &ro.ro_dst, sizeof(dst_sa)); src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, NULL, &error); if (src == NULL) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "nd6_na_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(ip6buf, &dst_sa.sin6_addr), error)); goto bad; } ip6->ip6_src = *src; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. */ if (sdl0 == NULL) { #ifdef DEV_CARP if (ifp->if_carp) mac = carp_macmatch6(ifp->if_carp, m, taddr6); if (mac == NULL) mac = nd6_ifptomac(ifp); #else mac = nd6_ifptomac(ifp); #endif } else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_short)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); ip6_output(m, NULL, &ro, 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; if (ro.ro_rt) { /* we don't cache this route. */ RTFREE(ro.ro_rt); } return; bad: if (ro.ro_rt) { RTFREE(ro.ro_rt); } m_freem(m); return; } caddr_t nd6_ifptomac(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif #ifdef IFT_CARP case IFT_CARP: #endif case IFT_BRIDGE: case IFT_ISO88025: return IF_LLADDR(ifp); default: return NULL; } } TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; }; +#ifdef VIMAGE_GLOBALS static struct dadq_head dadq; -static int dad_init = 0; +int dad_init; +#endif static struct dadq * nd6_dad_find(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct dadq *dp; for (dp = V_dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { if (dp->dad_ifa == ifa) return dp; } return NULL; } static void nd6_dad_starttimer(struct dadq *dp, int ticks) { callout_reset(&dp->dad_timer_ch, ticks, (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); } static void nd6_dad_stoptimer(struct dadq *dp) { callout_stop(&dp->dad_timer_ch); } /* * Start Duplicate Address Detection (DAD) for specified interface address. 
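 *
 * (Caller-side sketch, for illustration: in6_update_ifa() starts DAD
 * on a freshly configured tentative address with a random delay so
 * hosts booting simultaneously do not probe in lockstep; roughly:
 *
 *	if (in6if_do_dad(ifp) && (ia->ia6_flags & IN6_IFF_TENTATIVE))
 *		nd6_dad_start((struct ifaddr *)ia,
 *		    arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz));
 *
 * where the delay argument is in ticks, matching the callout use
 * below.)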
*/ void nd6_dad_start(struct ifaddr *ifa, int delay) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; if (!V_dad_init) { TAILQ_INIT(&V_dadq); V_dad_init++; } /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled (ip6_dad_count == 0) * - the interface address is anycast */ if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (!V_ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if (ifa->ifa_ifp == NULL) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); if (!(ifa->ifa_ifp->if_flags & IFF_UP)) { return; } if (nd6_dad_find(ifa) != NULL) { /* DAD already in progress */ return; } dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); TAILQ_INSERT_TAIL(&V_dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ dp->dad_ifa = ifa; IFAREF(ifa); /* just for safety */ dp->dad_count = V_ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (delay == 0) { nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000); } else { nd6_dad_starttimer(dp, delay); } } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct dadq *dp; if (!V_dad_init) return; dp = nd6_dad_find(ifa); if (!dp) { /* DAD wasn't started yet */ return; } nd6_dad_stoptimer(dp); TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_timer(struct ifaddr *ifa) { CURVNET_SET(dp->dad_vnet); INIT_VNET_INET6(curvnet); int s; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; s = splnet(); /* XXX */ /* Sanity check */ if (ia == NULL) { log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto done; } /* timeouted with IFF_{RUNNING,UP} check */ if (dp->dad_ns_tcount > V_dad_maxtry) { nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); goto done; } /* Need more checks? 
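 *
 * (Aside, for illustration: dad_ns_tcount counts transmission
 * *attempts*, including those skipped in nd6_dad_ns_output() while
 * the interface was down, whereas dad_ns_ocount counts NS actually
 * sent; the dad_maxtry test above therefore catches interfaces that
 * never become ready.)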
*/ if (dp->dad_ns_ocount < dp->dad_count) { /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_ns_output(dp, ifa); nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000); } else { /* * We have transmitted sufficient number of DAD packets. * See what we've got. */ int duplicate; duplicate = 0; if (dp->dad_na_icount) { /* * the check is in nd6_dad_na_input(), * but just in case */ duplicate++; } if (dp->dad_ns_icount) { /* We've seen NS, means DAD has failed. */ duplicate++; } if (duplicate) { /* (*dp) will be freed in nd6_dad_duplicated() */ dp = NULL; nd6_dad_duplicated(ifa); } else { /* * We are done with DAD. No NA came, no NS came. * No duplicate address found. */ ia->ia6_flags &= ~IN6_IFF_TENTATIVE; nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } } done: splx(s); CURVNET_RESTORE(); } void nd6_dad_duplicated(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; dp = nd6_dad_find(ifa); if (dp == NULL) { log(LOG_ERR, "nd6_dad_duplicated: DAD structure not found\n"); return; } log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " "NS in/out=%d/%d, NA in=%d\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; /* We are done with DAD, with duplicate address found. (failure) */ nd6_dad_stoptimer(dp); ifp = ifa->ifa_ifp; log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", if_name(ifp)); /* * If the address is a link-local address formed from an interface * identifier based on the hardware address which is supposed to be * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP * operation on the interface SHOULD be disabled. * [rfc2462bis-03 Section 5.4.5] */ if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { struct in6_addr in6; /* * To avoid over-reaction, we only apply this logic when we are * very sure that hardware addresses are supposed to be unique. 
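The decision reached above boils down to: any DAD NS observed from another node, or any NA for the tentative target, marks the address as a duplicate. A standalone restatement of that predicate (illustrative only, not the kernel code path):

/* Nonzero if the probed address must be considered a duplicate. */
static int
dad_duplicate(int ns_icount, int na_icount)
{
	return (ns_icount > 0 || na_icount > 0);
}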
*/ switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_ATM: case IFT_IEEE1394: #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif in6 = ia->ia_addr.sin6_addr; if (in6_get_hw_ifid(ifp, &in6) == 0 && IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: possible hardware address " "duplication detected, disable IPv6\n", if_name(ifp)); } break; } } TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { return; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { return; } dp->dad_ns_ocount++; nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1); } static void nd6_dad_ns_input(struct ifaddr *ifa) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct ifnet *ifp; const struct in6_addr *taddr6; struct dadq *dp; int duplicate; if (ifa == NULL) panic("ifa == NULL in nd6_dad_ns_input"); ia = (struct in6_ifaddr *)ifa; ifp = ifa->ifa_ifp; taddr6 = &ia->ia_addr.sin6_addr; duplicate = 0; dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ if (V_dad_ignore_ns) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " "address %s(%s)\n", ip6_sprintf(ip6buf, taddr6), if_name(ifa->ifa_ifp))); return; } /* * if I'm yet to start DAD, someone else started using this address * first. I have a duplicate and you win. */ if (dp == NULL || dp->dad_ns_ocount == 0) duplicate++; /* XXX more checks for loopback situation - see nd6_dad_timer too */ if (duplicate) { dp = NULL; /* will be freed in nd6_dad_duplicated() */ nd6_dad_duplicated(ifa); } else { /* * not sure if I got a duplicate. * increment ns count and see what happens. */ if (dp) dp->dad_ns_icount++; } } static void nd6_dad_na_input(struct ifaddr *ifa) { struct dadq *dp; if (ifa == NULL) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa); if (dp) dp->dad_na_icount++; /* remove the address. */ nd6_dad_duplicated(ifa); } Index: head/sys/netinet6/nd6_rtr.c =================================================================== --- head/sys/netinet6/nd6_rtr.c (revision 185087) +++ head/sys/netinet6/nd6_rtr.c (revision 185088) @@ -1,2118 +1,2114 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SDL(s) ((struct sockaddr_dl *)s) static int rtpref(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static int prelist_update __P((struct nd_prefixctl *, struct nd_defrouter *, struct mbuf *, int)); static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int); static struct nd_pfxrouter *pfxrtr_lookup __P((struct nd_prefix *, struct nd_defrouter *)); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); static struct nd_pfxrouter *find_pfxlist_reachable_router (struct nd_prefix *); static void defrouter_delreq(struct nd_defrouter *); static void nd6_rtmsg(int, struct rtentry *); static int in6_init_prefix_ltimes(struct nd_prefix *); static void in6_init_address_ltimes __P((struct nd_prefix *, struct in6_addrlifetime *)); static int rt6_deleteroute(struct radix_node *, void *); extern int nd6_recalc_reachtm_interval; +#ifdef VIMAGE_GLOBALS static struct ifnet *nd6_defifp; int nd6_defifindex; -int ip6_use_tempaddr = 0; - +int ip6_use_tempaddr; int ip6_desync_factor; -u_int32_t ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; -u_int32_t ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; -/* - * shorter lifetimes for debugging purposes. -int ip6_temp_preferred_lifetime = 800; -static int ip6_temp_valid_lifetime = 1800; -*/ -int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; +u_int32_t ip6_temp_preferred_lifetime; +u_int32_t ip6_temp_valid_lifetime; +int ip6_temp_regen_advance; +#endif /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 #define RTPREF_LOW (-1) #define RTPREF_RESERVED (-2) #define RTPREF_INVALID (-3) /* internal */ /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program * (rtadvd) so here we have no function like nd6_ra_output(). * * Based on RFC 2461 */ void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; char *lladdr = NULL; int lladdrlen = 0; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* If I'm not a router, ignore it. 
*/ if (V_ip6_accept_rtadv != 0 || V_ip6_forwarding != 1) goto freeit; /* Sanity checks */ if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } /* * Don't update the neighbor cache, if src = ::. * This indicates that the src has no IP address assigned yet. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); if (nd_rs == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_rs_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0); freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badrs++; m_freem(m); } /* * Receive Router Advertisement Message. * * Based on RFC 2461 * TODO: on-link bit on prefix information * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing */ void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = ND_IFINFO(ifp); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; int mcast = 0; union nd_opts ndopts; struct nd_defrouter *dr; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * We only accept RAs only when * the system-wide variable allows the acceptance, and * per-interface variable allows RAs on the receiving interface. */ if (V_ip6_accept_rtadv == 0) goto freeit; if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { nd6log((LOG_ERR, "nd6_ra_input: src %s is not link-local\n", ip6_sprintf(ip6bufs, &saddr6))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); if (nd_ra == NULL) { V_icmp6stat.icp6s_tooshort++; return; } #endif icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ra_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } { struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; /* remember if this is a multicasted advertisement */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) mcast = 1; bzero(&dr0, sizeof(dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = time_second + dr0.rtlifetime; dr0.ifp = ifp; /* unspecified or not? 
(RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); if (advreachable <= MAX_REACHABLE_TIME && ndi->basereachable != advreachable) { ndi->basereachable = advreachable; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */ } } if (nd_ra->nd_ra_retransmit) ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); if (nd_ra->nd_ra_curhoplimit) ndi->chlim = nd_ra->nd_ra_curhoplimit; dr = defrtrlist_update(&dr0); } /* * prefix */ if (ndopts.nd_opts_pi) { struct nd_opt_hdr *pt; struct nd_opt_prefix_info *pi = NULL; struct nd_prefixctl pr; for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi; pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end; pt = (struct nd_opt_hdr *)((caddr_t)pt + (pt->nd_opt_len << 3))) { if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION) continue; pi = (struct nd_opt_prefix_info *)pt; if (pi->nd_opt_pi_len != 4) { nd6log((LOG_INFO, "nd6_ra_input: invalid option " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_len)); continue; } if (128 < pi->nd_opt_pi_prefix_len) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_prefix_len)); continue; } if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "%s, ignored\n", ip6_sprintf(ip6bufs, &pi->nd_opt_pi_prefix))); continue; } bzero(&pr, sizeof(pr)); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif; pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_ONLINK) ? 1 : 0; pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_AUTO) ? 1 : 0; pr.ndpr_plen = pi->nd_opt_pi_prefix_len; pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time); pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); (void)prelist_update(&pr, dr, m, mcast); } } /* * MTU */ if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) { u_long mtu; u_long maxmtu; mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); /* lower bound */ if (mtu < IPV6_MMTU) { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " "mtu=%lu sent from %s, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src))); goto skip; } /* upper bound */ maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu) ? ndi->maxmtu : ifp->if_mtu; if (mtu <= maxmtu) { int change = (ndi->linkmtu != mtu); ndi->linkmtu = mtu; if (change) /* in6_maxmtu may change */ in6_setmaxmtu(); } else { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " "mtu=%lu sent from %s; " "exceeds maxmtu %lu, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu)); } } skip: /* * Source link layer address */ { char *lladdr = NULL; int lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_ADVERT, 0); /* * Installing a link-layer address might change the state of the * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. 
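The MTU option handling above reduces to a clamp: reject anything below the IPv6 minimum of 1280 and anything above the device limit. A user-space sketch of the same bounds:

#include <stdio.h>

#define IPV6_MMTU	1280	/* minimum IPv6 link MTU */

static unsigned long
ra_mtu(unsigned long mtu, unsigned long maxmtu, unsigned long if_mtu,
    unsigned long cur)
{
	unsigned long upper = (maxmtu != 0 && maxmtu < if_mtu) ?
	    maxmtu : if_mtu;

	if (mtu < IPV6_MMTU || mtu > upper)
		return (cur);		/* bogus option: keep current MTU */
	return (mtu);
}

int
main(void)
{
	/* 9000 is rejected on a 1500-byte link; 1400 is accepted */
	printf("%lu %lu\n", ra_mtu(9000, 0, 1500, 1500),
	    ra_mtu(1400, 0, 1500, 1500));
	return (0);
}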
*/ pfxlist_onlink_check(); } freeit: m_freem(m); return; bad: V_icmp6stat.icp6s_badra++; m_freem(m); } /* * default router list processing subroutines */ /* Announce the change to user processes watching the routing socket. */ static void nd6_rtmsg(int cmd, struct rtentry *rt) { struct rt_addrinfo info; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = TAILQ_FIRST(&rt->rt_ifp->if_addrlist)->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } rt_missmsg(cmd, &info, rt->rt_flags, 0); } void defrouter_addreq(struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; int s; int error; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; s = splnet(); error = rtrequest(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt); if (newrt) { RT_LOCK(newrt); nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RT_REMREF(newrt); RT_UNLOCK(newrt); } if (error == 0) new->installed = 1; splx(s); return; } struct nd_defrouter * defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) { INIT_VNET_INET6(ifp->if_vnet); struct nd_defrouter *dr; for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) return (dr); } return (NULL); /* search failed */ } /* * Remove the default route for a given router. * This is just a subroutine function for defrouter_select(), and should * not be called from anywhere else. */ static void defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; rtrequest(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt); if (oldrt) { nd6_rtmsg(RTM_DELETE, oldrt); RTFREE(oldrt); } dr->installed = 0; } /* * Remove all default routes from the default router list. */ void defrouter_reset(void) { INIT_VNET_INET6(curvnet); struct nd_defrouter *dr; for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) defrouter_delreq(dr); /* * XXX should we also nuke any default routers in the kernel, by * going through them by rtalloc1()? */ } void defrtrlist_del(struct nd_defrouter *dr) { INIT_VNET_INET6(curvnet); struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; /* * Flush all the routing table entries that use the router * as a next hop. */ if (!V_ip6_forwarding && V_ip6_accept_rtadv) /* XXX: better condition? */ rt6_flush(&dr->rtaddr, dr->ifp); if (dr->installed) { deldr = dr; defrouter_delreq(dr); } TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); /* * Also delete all the pointers to the router in each prefix list. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { struct nd_pfxrouter *pfxrtr; if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); } pfxlist_onlink_check(); /* * If the router is the primary one, choose a new one. * Note that defrouter_select() will remove the current gateway * from the routing table.
*/ if (deldr) defrouter_select(); free(dr, M_IP6NDP); } /* * Default Router Selection according to Section 6.3.6 of RFC 2461 and * draft-ietf-ipngwg-router-selection: * 1) Routers that are reachable or probably reachable should be preferred. * If we have more than one (probably) reachable router, prefer ones * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. * * We assume nd_defrouter is sorted by router preference value. * Since the code below covers both with and without router preference cases, * we do not need to classify the cases by ifdef. * * At this moment, we do not try to install more than one default router, * even when the multipath routing is available, because we're not sure about * the benefits for stub hosts comparing to the risk of making the code * complicated and the possibility of introducing bugs. */ void defrouter_select(void) { INIT_VNET_INET6(curvnet); int s = splnet(); struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; /* * This function should be called only when acting as an autoconfigured * host. Although the remaining part of this function is not effective * if the node is not an autoconfigured host, we explicitly exclude * such cases here for safety. */ if (V_ip6_forwarding || !V_ip6_accept_rtadv) { nd6log((LOG_WARNING, "defrouter_select: called unexpectedly (forwarding=%d, " "accept_rtadv=%d)\n", V_ip6_forwarding, V_ip6_accept_rtadv)); splx(s); return; } /* * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ if (!TAILQ_FIRST(&V_nd_defrouter)) { splx(s); return; } /* * Search for a (probably) reachable router from the list. * We just pick up the first reachable one (if any), assuming that * the ordering rule of the list described in defrtrlist_update(). */ for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (selected_dr == NULL && (rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln)) { selected_dr = dr; } if (dr->installed && installed_dr == NULL) installed_dr = dr; else if (dr->installed && installed_dr) { /* this should not happen. warn for diagnosis. */ log(LOG_ERR, "defrouter_select: more than one router" " is installed\n"); } } /* * If none of the default routers was found to be reachable, * round-robin the list regardless of preference. * Otherwise, if we have an installed router, check if the selected * (reachable) router should really be preferred to the installed one. * We only prefer the new router when the old one is not reachable * or when the new one has a really higher preference value. */ if (selected_dr == NULL) { if (installed_dr == NULL || !TAILQ_NEXT(installed_dr, dr_entry)) selected_dr = TAILQ_FIRST(&V_nd_defrouter); else selected_dr = TAILQ_NEXT(installed_dr, dr_entry); } else if (installed_dr && (rt = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln) && rtpref(selected_dr) <= rtpref(installed_dr)) { selected_dr = installed_dr; } /* * If the selected router is different than the installed one, * remove the installed router and install the selected one. 
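When no router is (probably) reachable, the selection above degenerates to round-robin: advance past the installed router, wrapping to the list head. A minimal restatement over a simplified list type (not the kernel's TAILQ):

struct rr_node { struct rr_node *next; };

/* Pick the router after the installed one, wrapping to the head. */
static struct rr_node *
round_robin(struct rr_node *head, struct rr_node *installed)
{
	if (installed == NULL || installed->next == NULL)
		return (head);
	return (installed->next);
}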
* Note that the selected router is never NULL here. */ if (installed_dr != selected_dr) { if (installed_dr) defrouter_delreq(installed_dr); defrouter_addreq(selected_dr); } splx(s); return; } /* * for default router selection * regards router-preference field as a 2-bit signed integer */ static int rtpref(struct nd_defrouter *dr) { switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) { case ND_RA_FLAG_RTPREF_HIGH: return (RTPREF_HIGH); case ND_RA_FLAG_RTPREF_MEDIUM: case ND_RA_FLAG_RTPREF_RSV: return (RTPREF_MEDIUM); case ND_RA_FLAG_RTPREF_LOW: return (RTPREF_LOW); default: /* * This case should never happen. If it did, it would mean a * serious bug of kernel internal. We thus always bark here. * Or, can we even panic? */ log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags); return (RTPREF_INVALID); } /* NOTREACHED */ } static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { INIT_VNET_INET6(curvnet); struct nd_defrouter *dr, *n; int s = splnet(); if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) { /* entry exists */ if (new->rtlifetime == 0) { defrtrlist_del(dr); dr = NULL; } else { int oldpref = rtpref(dr); /* override */ dr->flags = new->flags; /* xxx flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; /* * If the preference does not change, there's no need * to sort the entries. */ if (rtpref(new) == oldpref) { splx(s); return (dr); } /* * preferred router may be changed, so relocate * this router. * XXX: calling TAILQ_REMOVE directly is a bad manner. * However, since defrtrlist_del() has many side * effects, we intentionally do so here. * defrouter_select() below will handle routing * changes later. */ TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); n = dr; goto insert; } splx(s); return (dr); } /* entry does not exist */ if (new->rtlifetime == 0) { splx(s); return (NULL); } n = (struct nd_defrouter *)malloc(sizeof(*n), M_IP6NDP, M_NOWAIT); if (n == NULL) { splx(s); return (NULL); } bzero(n, sizeof(*n)); *n = *new; insert: /* * Insert the new router in the Default Router List; * The Default Router List should be in the descending order * of router-preferece. Routers with the same preference are * sorted in the arriving time order. 
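The insertion below keeps the list in descending preference with arrival order inside each preference class: scan for the first entry of strictly lower preference and insert before it. A worked sketch with plain integers:

#include <stdio.h>

int
main(void)
{
	int pref[] = { 1, 1, 0, 0, -1 };	/* existing rtpref() values */
	int n = 5, newpref = 0, i;

	for (i = 0; i < n; i++)
		if (newpref > pref[i])
			break;			/* insert before this entry */
	printf("insert at index %d\n", i);	/* 4: after the last 0 */
	return (0);
}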
*/ /* insert at the end of the group */ for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (rtpref(n) > rtpref(dr)) break; } if (dr) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); defrouter_select(); splx(s); return (n); } static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; for (search = pr->ndpr_advrtrs.lh_first; search; search = search->pfr_next) { if (search->router == dr) break; } return (search); } static void pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; new = (struct nd_pfxrouter *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return; bzero(new, sizeof(*new)); new->router = dr; LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); pfxlist_onlink_check(); } static void pfxrtr_del(struct nd_pfxrouter *pfr) { LIST_REMOVE(pfr, pfr_entry); free(pfr, M_IP6NDP); } struct nd_prefix * nd6_prefix_lookup(struct nd_prefixctl *key) { INIT_VNET_INET6(curvnet); struct nd_prefix *search; for (search = V_nd_prefix.lh_first; search; search = search->ndpr_next) { if (key->ndpr_ifp == search->ndpr_ifp && key->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr, &search->ndpr_prefix.sin6_addr, key->ndpr_plen)) { break; } } return (search); } int nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, struct nd_prefix **newp) { INIT_VNET_INET6(curvnet); struct nd_prefix *new = NULL; int error = 0; int i, s; char ip6buf[INET6_ADDRSTRLEN]; new = (struct nd_prefix *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT); if (new == NULL) return(ENOMEM); bzero(new, sizeof(*new)); new->ndpr_ifp = pr->ndpr_ifp; new->ndpr_prefix = pr->ndpr_prefix; new->ndpr_plen = pr->ndpr_plen; new->ndpr_vltime = pr->ndpr_vltime; new->ndpr_pltime = pr->ndpr_pltime; new->ndpr_flags = pr->ndpr_flags; if ((error = in6_init_prefix_ltimes(new)) != 0) { free(new, M_IP6NDP); return(error); } new->ndpr_lastupdate = time_second; if (newp != NULL) *newp = new; /* initialization */ LIST_INIT(&new->ndpr_advrtrs); in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ for (i = 0; i < 4; i++) new->ndpr_prefix.sin6_addr.s6_addr32[i] &= new->ndpr_mask.s6_addr32[i]; s = splnet(); /* link ndpr_entry to nd_prefix list */ LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry); splx(s); /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; if ((e = nd6_prefix_onlink(new)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link on %s (errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } } if (dr) pfxrtr_add(new, dr); return 0; } void prelist_remove(struct nd_prefix *pr) { INIT_VNET_INET6(curvnet); struct nd_pfxrouter *pfr, *next; int e, s; char ip6buf[INET6_ADDRSTRLEN]; /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; /* * Though these flags are now meaningless, we'd rather keep the value * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users * when executing "ndp -p". */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 && (e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink " "on %s, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* what should we do? 
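nd6_prelist_add() above masks the advertised prefix into canonical form so that later lookups compare equal regardless of stray host bits. The same operation in user space, simplified to direct zeroing of the host bytes of a /64:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in6_addr p;
	char buf[INET6_ADDRSTRLEN];
	int i;

	inet_pton(AF_INET6, "2001:db8::1", &p);	/* host bits set */
	for (i = 8; i < 16; i++)		/* /64: clear low 8 bytes */
		p.s6_addr[i] = 0;
	printf("%s\n", inet_ntop(AF_INET6, &p, buf, sizeof(buf)));
	/* prints 2001:db8:: */
	return (0);
}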
*/ } if (pr->ndpr_refcnt > 0) return; /* notice here? */ s = splnet(); /* unlink ndpr_entry from nd_prefix list */ LIST_REMOVE(pr, ndpr_entry); /* free list of routers that adversed the prefix */ for (pfr = pr->ndpr_advrtrs.lh_first; pfr; pfr = next) { next = pfr->pfr_next; free(pfr, M_IP6NDP); } splx(s); free(pr, M_IP6NDP); pfxlist_onlink_check(); } /* * dr - may be NULL */ static int prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, struct mbuf *m, int mcast) { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; struct nd_prefix *pr; int s = splnet(); int error = 0; int newprefix = 0; int auth; struct in6_addrlifetime lt6_tmp; char ip6buf[INET6_ADDRSTRLEN]; auth = 0; if (m) { /* * Authenticity for NA consists authentication for * both IP header and IP datagrams, doesn't it ? */ #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM) auth = ((m->m_flags & M_AUTHIPHDR) && (m->m_flags & M_AUTHIPDGM)); #endif } if ((pr = nd6_prefix_lookup(new)) != NULL) { /* * nd6_prefix_lookup() ensures that pr and new have the same * prefix on a same interface. */ /* * Update prefix information. Note that the on-link (L) bit * and the autonomous (A) bit should NOT be changed from 1 * to 0. */ if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) pr->ndpr_raf_auto = 1; if (new->ndpr_raf_onlink) { pr->ndpr_vltime = new->ndpr_vltime; pr->ndpr_pltime = new->ndpr_pltime; (void)in6_init_prefix_ltimes(pr); /* XXX error case? */ pr->ndpr_lastupdate = time_second; } if (new->ndpr_raf_onlink && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { int e; if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } } if (dr && pfxrtr_lookup(pr, dr) == NULL) pfxrtr_add(pr, dr); } else { struct nd_prefix *newpr = NULL; newprefix = 1; if (new->ndpr_vltime == 0) goto end; if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0) goto end; error = nd6_prelist_add(new, dr, &newpr); if (error != 0 || newpr == NULL) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s " "errno=%d, returnpr=%p\n", ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), error, newpr)); goto end; /* we should just give up in this case. */ } /* * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ if (newpr->ndpr_raf_onlink == 0) { newpr->ndpr_vltime = 0; newpr->ndpr_pltime = 0; in6_init_prefix_ltimes(newpr); } pr = newpr; } /* * Address autoconfiguration based on Section 5.5.3 of RFC 2462. * Note that pr must be non NULL at this point. */ /* 5.5.3 (a). Ignore the prefix without the A bit set. */ if (!new->ndpr_raf_auto) goto end; /* * 5.5.3 (b). the link-local prefix should have been ignored in * nd6_ra_input. */ /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ if (new->ndpr_pltime > new->ndpr_vltime) { error = EINVAL; /* XXX: won't be used */ goto end; } /* * 5.5.3 (d). 
If the prefix advertised is not equal to the prefix of * an address configured by stateless autoconfiguration already in the * list of addresses associated with the interface, and the Valid * Lifetime is not 0, form an address. We first check if we have * a matching prefix. * Note: we apply a clarification in rfc2462bis-02 here. We only * consider autoconfigured addresses while RFC2462 simply said * "address". */ TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { struct in6_ifaddr *ifa6; u_int32_t remaininglifetime; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; /* * We only consider autoconfigured addresses as per rfc2462bis. */ if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) continue; /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) continue; /* * Ignore the address if it is not associated with a prefix * or is associated with a prefix that is different from this * one. (pr is never NULL here) */ if (ifa6->ia6_ndpr != pr) continue; if (ia6_match == NULL) /* remember the first one */ ia6_match = ifa6; /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. * We apply some clarifications in rfc2462bis: * - use remaininglifetime instead of storedlifetime as a * variable name * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) remaininglifetime = ND6_INFINITE_LIFETIME; else if (time_second - ifa6->ia6_updatetime > lt6_tmp.ia6t_vltime) { /* * The case of "invalid" address. We should usually * not see this case. */ remaininglifetime = 0; } else remaininglifetime = lt6_tmp.ia6t_vltime - (time_second - ifa6->ia6_updatetime); /* when not updating, keep the current stored lifetime. */ lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; in6_init_address_ltimes(pr, <6_tmp); /* * We need to treat lifetimes for temporary addresses * differently, according to * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1); * we only update the lifetimes when they are in the maximum * intervals. 
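The valid-lifetime update above implements the RFC 2462 5.5.3(e) "two hour" rule: an unauthenticated RA may always lengthen a lifetime, but may never shorten it below two hours. Restated as a standalone function mirroring the branches above (illustrative only):

#define TWOHOUR	(120 * 60)

static unsigned int
update_vltime(unsigned int remaining, unsigned int advertised, int auth)
{
	if (advertised > TWOHOUR || advertised > remaining)
		return (advertised);	/* lengthening is always allowed */
	if (remaining <= TWOHOUR)
		return (auth ? advertised : remaining);
	return (TWOHOUR);		/* clamp the shortening at two hours */
}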
*/ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { u_int32_t maxvltime, maxpltime; if (V_ip6_temp_valid_lifetime > (u_int32_t)((time_second - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxvltime = V_ip6_temp_valid_lifetime - (time_second - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxvltime = 0; if (V_ip6_temp_preferred_lifetime > (u_int32_t)((time_second - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxpltime = V_ip6_temp_preferred_lifetime - (time_second - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxpltime = 0; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_vltime > maxvltime) { lt6_tmp.ia6t_vltime = maxvltime; } if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_pltime > maxpltime) { lt6_tmp.ia6t_pltime = maxpltime; } } ifa6->ia6_lifetime = lt6_tmp; ifa6->ia6_updatetime = time_second; } if (ia6_match == NULL && new->ndpr_vltime) { int ifidlen; /* * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ /* * Prefix Length check: * If the sum of the prefix length and interface identifier * length does not equal 128 bits, the Prefix Information * option MUST be ignored. The length of the interface * identifier is defined in a separate link-type specific * document. */ ifidlen = in6_if2idlen(ifp); if (ifidlen < 0) { /* this should not happen, so we always log it. */ log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", if_name(ifp)); goto end; } if (ifidlen + pr->ndpr_plen != 128) { nd6log((LOG_INFO, "prelist_update: invalid prefixlen " "%d for %s, ignored\n", pr->ndpr_plen, if_name(ifp))); goto end; } if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. */ pr->ndpr_refcnt++; ia6->ia6_ndpr = pr; /* * RFC 3041 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * * RFC 3041 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary * addresses. Thus, we specifiy 1 as the 2nd arg of * in6_tmpifadd(). */ if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } /* * A newly added address might affect the status * of other addresses, so we check and update it. * XXX: what if address duplication happens? */ pfxlist_onlink_check(); } else { /* just set an error. do not bark here. */ error = EADDRNOTAVAIL; /* XXX: might be unused. */ } } end: splx(s); return error; } /* * A supplement function used in the on-link detection below; * detect if a given prefix has a (probably) reachable advertising router. * XXX: lengthy function name... */ static struct nd_pfxrouter * find_pfxlist_reachable_router(struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct rtentry *rt; struct llinfo_nd6 *ln; for (pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); pfxrtr; pfxrtr = LIST_NEXT(pfxrtr, pfr_entry)) { if ((rt = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp)) && (ln = (struct llinfo_nd6 *)rt->rt_llinfo) && ND6_IS_LLINFO_PROBREACH(ln)) break; /* found */ } return (pfxrtr); } /* * Check if each prefix in the prefix list has at least one available router * that advertised the prefix (a router is "available" if its neighbor cache * entry is reachable or probably reachable). 
* If the check fails, the prefix may be off-link, because, for example, * we have moved from the network but the lifetime of the prefix has not * expired yet. So we should not use the prefix if there is another prefix * that has an available router. * But, if there is no prefix that has an available router, we still regard * all the prefixes as on-link. This is because we can't tell if all the * routers are simply dead or if we really moved from the network and there * is no router around us. */ void pfxlist_onlink_check() { INIT_VNET_INET6(curvnet); struct nd_prefix *pr; struct in6_ifaddr *ifa; struct nd_defrouter *dr; struct nd_pfxrouter *pfxrtr = NULL; /* * Check if there is a prefix that has a reachable advertising * router. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; } /* * If we have no such prefix, check whether we still have a router * that does not advertise any prefixes. */ if (pr == NULL) { for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { struct nd_prefix *pr0; for (pr0 = V_nd_prefix.lh_first; pr0; pr0 = pr0->ndpr_next) { if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) break; } if (pfxrtr != NULL) break; } } if (pr != NULL || (TAILQ_FIRST(&V_nd_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise * any prefixes. The latter would be the case when we move * to a new link where we have a router that does not provide * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { /* XXX: a link-local prefix should never be detached */ if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* * we aren't interested in prefixes without the L bit * set. */ if (pr->ndpr_raf_onlink == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && find_pfxlist_reachable_router(pr) == NULL) pr->ndpr_stateflags |= NDPRF_DETACHED; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && find_pfxlist_reachable_router(pr) != NULL) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } else { /* there is no prefix that has a reachable router */ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; if (pr->ndpr_raf_onlink == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } /* * Remove each interface route associated with a (just) detached * prefix, and reinstall the interface route for a (just) attached * prefix. Note that reinstallation does not * necessarily succeed when the same prefix is shared among multiple * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them.
*/ for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { int e; char ip6buf[INET6_ADDRSTRLEN]; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; if (pr->ndpr_raf_onlink == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } } if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && pr->ndpr_raf_onlink) { if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d onlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } } } /* * Changes on the prefix status might affect address status as well. * Make sure that all addresses derived from an attached prefix are * attached, and that all addresses derived from a detached prefix are * detached. Note, however, that a manually configured address should * always be attached. * The precise detection logic is same as the one for prefixes. */ for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF)) continue; if (ifa->ia6_ndpr == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... */ continue; } if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) break; } if (ifa) { for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ continue; if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) { if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; nd6_dad_start((struct ifaddr *)ifa, 0); } } else { ifa->ia6_flags |= IN6_IFF_DETACHED; } } } else { for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; /* Do we need a delay in this case? */ nd6_dad_start((struct ifaddr *)ifa, 0); } } } } int nd6_prefix_onlink(struct nd_prefix *pr) { INIT_VNET_INET6(curvnet); struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; struct sockaddr_in6 mask6; struct nd_prefix *opr; u_long rtflags; int error = 0; struct rtentry *rt = NULL; char ip6buf[INET6_ADDRSTRLEN]; /* sanity check */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { nd6log((LOG_ERR, "nd6_prefix_onlink: %s/%d is already on-link\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); return (EEXIST); } /* * Add the interface route associated with the prefix. Before * installing the route, check if there's the same prefix on another * interface, and the prefix has already installed the interface route. * Although such a configuration is expected to be rare, we explicitly * allow it. */ for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) return (0); } /* * We prefer link-local addresses as the associated interface address. 
*/ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY | IN6_IFF_ANYCAST); if (ifa == NULL) { /* XXX: freebsd does not have ifa_ifwithaf */ TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family == AF_INET6) break; } /* should we care about ia6_flags? */ } if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA * containing a prefix with the L bit set and the A bit clear, * after removing all IPv6 addresses on the receiving * interface. This should, of course, be rare though. */ nd6log((LOG_NOTICE, "nd6_prefix_onlink: failed to find any ifaddr" " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); return (0); } /* * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs. * ifa->ifa_rtrequest = nd6_rtrequest; */ bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; rtflags = ifa->ifa_flags | RTF_CLONING | RTF_UP; if (nd6_need_cache(ifp)) { /* explicitly set in case ifa_flags does not set the flag. */ rtflags |= RTF_CLONING; } else { /* * explicitly clear the cloning bit in case ifa_flags sets it. */ rtflags &= ~RTF_CLONING; } error = rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt); if (error == 0) { if (rt != NULL) /* this should be non NULL, though */ nd6_rtmsg(RTM_ADD, rt); pr->ndpr_stateflags |= NDPRF_ONLINK; } else { char ip6bufg[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN]; nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a" " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx " "errno = %d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), ip6_sprintf(ip6bufg, &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr), ip6_sprintf(ip6bufm, &mask6.sin6_addr), rtflags, error)); } if (rt != NULL) { RT_LOCK(rt); RT_REMREF(rt); RT_UNLOCK(rt); } return (error); } int nd6_prefix_offlink(struct nd_prefix *pr) { INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6; struct rtentry *rt = NULL; char ip6buf[INET6_ADDRSTRLEN]; /* sanity check */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: %s/%d is already off-link\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); return (EEXIST); } bzero(&sa6, sizeof(sa6)); sa6.sin6_family = AF_INET6; sa6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr, sizeof(struct in6_addr)); bzero(&mask6, sizeof(mask6)); mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); error = rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, (struct sockaddr *)&mask6, 0, &rt); if (error == 0) { pr->ndpr_stateflags &= ~NDPRF_ONLINK; /* report the route deletion to the routing socket. */ if (rt != NULL) nd6_rtmsg(RTM_DELETE, rt); /* * There might be the same prefix on another interface, * the prefix which could not be on-link just because we have * the interface route (see comments in nd6_prefix_onlink). * If there's one, try to make the prefix on-link on the * interface. */ for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) continue; /* * KAME specific: detached prefixes should not be * on-link. 
*/ if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { int e; if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " "to %s (errno = %d)\n", ip6_sprintf(ip6buf, &opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } } } } else { /* XXX: can we still set the NDPRF_ONLINK flag? */ nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s (errno = %d)\n", ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), error)); } if (rt != NULL) { RTFREE(rt); } return (error); } static struct in6_ifaddr * in6_ifadd(struct nd_prefixctl *pr, int mcast) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; struct in6_addr mask; int prefixlen = pr->ndpr_plen; int updateflags; char ip6buf[INET6_ADDRSTRLEN]; in6_prefixlen2mask(&mask, prefixlen); /* * find a link-local address (will be interface ID). * Is it really mandatory? Theoretically, a global or a site-local * address can be configured without a link-local address, if we * have a unique interface identifier... * * it is not mandatory to have a link-local address, we can generate * interface identifier on the fly. we do this because: * (1) it should be the easiest way to find interface identifier. * (2) RFC2462 5.4 suggesting the use of the same interface identifier * for multiple addresses on a single interface, and possible shortcut * of DAD. we omitted DAD for this reason in the past. * (3) a user can prevent autoconfiguration of global address * by removing link-local address by hand (this is partly because we * don't have other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */ if (ifa) ib = (struct in6_ifaddr *)ifa; else return NULL; /* prefixlen + ifidlen must be equal to 128 */ plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); if (prefixlen != plen0) { nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); return NULL; } /* make ifaddr */ bzero(&ifra, sizeof(ifra)); /* * in6_update_ifa() does not use ifra_name, but we accurately set it * for safety. 
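The block just below forms the autoconfigured address by taking the prefix bits from the RA and the interface-identifier bits from the link-local address. A worked user-space version for a /64 (the addresses are made-up documentation values):

#include <stdio.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in6_addr pfx, ll, out;
	char buf[INET6_ADDRSTRLEN];
	int i;

	inet_pton(AF_INET6, "2001:db8:1::", &pfx);		/* RA prefix */
	inet_pton(AF_INET6, "fe80::216:3eff:fe11:2233", &ll);	/* link-local */
	for (i = 0; i < 16; i++) {
		unsigned char m = (i < 8) ? 0xff : 0x00;	/* /64 mask */
		out.s6_addr[i] = (pfx.s6_addr[i] & m) | (ll.s6_addr[i] & ~m);
	}
	printf("%s\n", inet_ntop(AF_INET6, &out, buf, sizeof(buf)));
	/* prints 2001:db8:1:0:216:3eff:fe11:2233 */
	return (0);
}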
*/ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); /* prefix */ ifra.ifra_addr.sin6_addr = pr->ndpr_prefix.sin6_addr; ifra.ifra_addr.sin6_addr.s6_addr32[0] &= mask.s6_addr32[0]; ifra.ifra_addr.sin6_addr.s6_addr32[1] &= mask.s6_addr32[1]; ifra.ifra_addr.sin6_addr.s6_addr32[2] &= mask.s6_addr32[2]; ifra.ifra_addr.sin6_addr.s6_addr32[3] &= mask.s6_addr32[3]; /* interface ID */ ifra.ifra_addr.sin6_addr.s6_addr32[0] |= (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); ifra.ifra_addr.sin6_addr.s6_addr32[1] |= (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); /* new prefix mask. */ ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; bcopy(&mask, &ifra.ifra_prefixmask.sin6_addr, sizeof(ifra.ifra_prefixmask.sin6_addr)); /* lifetimes. */ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ /* * Make sure that we do not have this address already. This should * usually not happen, but we can still see this case, e.g., if we * have manually configured the exact address to be configured. */ if (in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr) != NULL) { /* this should be rare enough to make an explicit log */ log(LOG_INFO, "in6_ifadd: %s is already configured\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); return (NULL); } /* * Allocate ifaddr structure, link into chain, etc. * If we are going to create a new address upon receiving a multicasted * RA, we need to impose a random delay before starting DAD. * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ updateflags = 0; if (mcast) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), if_name(ifp), error)); return (NULL); /* ifaddr must not have been allocated. 
*/ } ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); return (ia); /* this is always non-NULL */ } /* * ia0 - corresponding public address */ int in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) { INIT_VNET_INET6(curvnet); struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia, *ia; struct in6_aliasreq ifra; int i, error; int trylimit = 3; /* XXX: adhoc value */ int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; bzero(&ifra, sizeof(ifra)); strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); ifra.ifra_addr = ia0->ia_addr; /* copy prefix mask */ ifra.ifra_prefixmask = ia0->ia_prefixmask; /* clear the old IFID */ for (i = 0; i < 4; i++) { ifra.ifra_addr.sin6_addr.s6_addr32[i] &= ifra.ifra_prefixmask.sin6_addr.s6_addr32[i]; } again: if (in6_get_tmpifid(ifp, (u_int8_t *)randid, (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good " "random IFID\n")); return (EINVAL); } ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* * in6_get_tmpifid() quite likely provided a unique interface ID. * However, we may still have a chance to see collision, because * there may be a time lag between generation of the ID and generation * of the address. So, we'll do one more sanity check. */ for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &ifra.ifra_addr.sin6_addr)) { if (trylimit-- == 0) { /* * Give up. Something strange should have * happened. */ nd6log((LOG_NOTICE, "in6_tmpifadd: failed to " "find a unique random IFID\n")); return (EEXIST); } forcegen = 1; goto again; } } /* * The Valid Lifetime is the lower of the Valid Lifetime of the * public address or TEMP_VALID_LIFETIME. * The Preferred Lifetime is the lower of the Preferred Lifetime * of the public address or TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { vltime0 = IFA6_IS_INVALID(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_vltime - (time_second - ia0->ia6_updatetime)); if (vltime0 > V_ip6_temp_valid_lifetime) vltime0 = V_ip6_temp_valid_lifetime; } else vltime0 = V_ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_pltime - (time_second - ia0->ia6_updatetime)); if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){ pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; } } else pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. */ if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance) return (0); /* XXX: scope zone ID? */ ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. */ updateflags = 0; if (delay) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? 
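in6_tmpifadd() above caps the temporary address's lifetimes by both the public address's remaining lifetimes and the RFC 3041 constants; the desync factor keeps regeneration times from synchronizing across hosts. A numeric sketch with assumed defaults (1 day preferred lifetime, 300 s desync factor):

#include <stdio.h>

int
main(void)
{
	unsigned int temp_preferred = 86400;	/* assumed: 1 day */
	unsigned int desync = 300;		/* assumed desync factor, s */
	unsigned int public_left = 14400;	/* public addr: 4 h remain */
	unsigned int pltime = public_left;

	if (pltime > temp_preferred - desync)
		pltime = temp_preferred - desync;
	printf("temporary pltime = %u s\n", pltime);	/* 14400 */
	return (0);
}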
*/ nd6log((LOG_ERR, "in6_tmpifadd: ifa update succeeded, but we got " "no ifaddr\n")); return (EINVAL); /* XXX */ } newia->ia6_ndpr = ia0->ia6_ndpr; newia->ia6_ndpr->ndpr_refcnt++; /* * A newly added address might affect the status of other addresses. * XXX: when the temporary address is generated with a new public * address, the onlink check is redundant. However, it would be safe * to do the check explicitly everywhere a new address is generated, * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ pfxlist_onlink_check(); return (0); } static int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_preferred = 0; else ndpr->ndpr_preferred = time_second + ndpr->ndpr_pltime; if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_expire = 0; else ndpr->ndpr_expire = time_second + ndpr->ndpr_vltime; return (0); } static void in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) { /* init ia6t_expire */ if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = time_second; lt6->ia6t_expire += lt6->ia6t_vltime; } /* init ia6t_preferred */ if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = time_second; lt6->ia6t_preferred += lt6->ia6t_pltime; } } /* * Delete all the routing table entries that use the specified gateway. * XXX: this function causes a search through all entries of the routing * table, so it shouldn't be called when acting as a router. */ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { INIT_VNET_NET(curvnet); struct radix_node_head *rnh = V_rt_tables[0][AF_INET6]; int s = splnet(); /* We care only about link-local addresses. */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) { splx(s); return; } RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, rt6_deleteroute, (void *)gateway); RADIX_NODE_HEAD_UNLOCK(rnh); splx(s); } static int rt6_deleteroute(struct radix_node *rn, void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) struct rtentry *rt = (struct rtentry *)rn; struct in6_addr *gate = (struct in6_addr *)arg; if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) return (0); if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) { return (0); } /* * Do not delete a static route. * XXX: this seems to be a bit ad-hoc. Should we consider the * 'cloned' bit instead? */ if ((rt->rt_flags & RTF_STATIC) != 0) return (0); /* * We delete only host routes. This means, in particular, we don't * delete the default route. */ if ((rt->rt_flags & RTF_HOST) == 0) return (0); return (rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0)); #undef SIN6 } int nd6_setdefaultiface(int ifindex) { INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); int error = 0; if (ifindex < 0 || V_if_index < ifindex) return (EINVAL); if (ifindex != 0 && !ifnet_byindex(ifindex)) return (EINVAL); if (V_nd6_defifindex != ifindex) { V_nd6_defifindex = ifindex; if (V_nd6_defifindex > 0) V_nd6_defifp = ifnet_byindex(V_nd6_defifindex); else V_nd6_defifp = NULL; /* * Our current implementation assumes one-to-one mapping between * interfaces and links, so it would be natural to use the * default interface as the default link.
*/ scope6_setdefault(V_nd6_defifp); } return (error); } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 185087) +++ head/sys/netinet6/raw_ip6.c (revision 185088) @@ -1,834 +1,836 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_long rip_sendspace; extern u_long rip_recvspace; +#ifdef VIMAGE_GLOBALS struct rip6stat rip6stat; +#endif /* * Hooks for multicast forwarding. */ struct socket *ip6_mrouter = NULL; int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(int, caddr_t); /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET(curvnet); INIT_VNET_INET6(curvnet); #ifdef IPSEC INIT_VNET_IPSEC(curvnet); #endif struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; V_rip6stat.rip6s_ipackets++; if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX Send icmp6 host/port unreach? */ m_freem(m); return (IPPROTO_DONE); } init_sin6(&fromsa, m); /* general init */ INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { if ((in6p->in6p_vflag & INP_IPV6) == 0) continue; if (in6p->in6p_ip6_nxt && in6p->in6p_ip6_nxt != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); if (in6p->in6p_cksum != -1) { V_rip6stat.rip6s_isum++; if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { INP_RUNLOCK(in6p); V_rip6stat.rip6s_badsum++; continue; } } if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (n && ipsec6_in_reject(n, last)) { m_freem(n); V_ipsec6stat.in_polvio++; /* Do not inject data into pcb. */ } else #endif /* IPSEC */ if (n) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); V_rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (last && ipsec6_in_reject(m, last)) { m_freem(m); V_ipsec6stat.in_polvio++; V_ip6stat.ip6s_delivered--; /* Do not inject data into pcb. */ INP_RUNLOCK(last); } else #endif /* IPSEC */ if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS || last->in6p_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* Strip intermediate headers. 
*/ m_adj(m, *offp); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); V_rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); INP_RUNLOCK(last); } else { V_rip6stat.rip6s_nosock++; if (m->m_flags & M_MCAST) V_rip6stat.rip6s_nosockmcast++; if (proto == IPPROTO_NONE) m_freem(m); else { char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } V_ip6stat.ip6s_delivered--; } return (IPPROTO_DONE); } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { INIT_VNET_INET(curvnet); struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* * If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options user * may have setup with control call. */ int #if __STDC__ rip6_output(struct mbuf *m, ...) #else rip6_output(m, va_alist) struct mbuf *m; va_dcl #endif { INIT_VNET_INET6(curvnet); struct mbuf *control; struct socket *so; struct sockaddr_in6 *dstsock; struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; struct in6_addr *in6a; va_list ap; va_start(ap, m); so = va_arg(ap, struct socket *); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoin6pcb(so); INP_WLOCK(in6p); dst = &dstsock->sin6_addr; if (control) { if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (dstsock->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, V_ip6_use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. 
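* in6_selectsrc() below performs RFC 3484-style source address selection, * using the destination, the socket's options and credentials, and the * prospective outgoing interface, which it also reports back (via &oifp) * for the scope handling that follows.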
*/ if ((in6a = in6_selectsrc(dstsock, optp, in6p, NULL, so->so_cred, &oifp, &error)) == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto bad; } ip6->ip6_src = *in6a; if (oifp && scope_ambiguous) { /* * Applications should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as they should, so we need a * workaround. Even if an appropriate ID is not determined * (when it's required), if we can determine the outgoing * interface, determine the zone ID based on the interface. */ error = in6_setscope(&dstsock->sin6_addr, oifp, NULL); if (error != 0) goto bad; } ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in by ip6_output, so we do not fill it in * here. */ ip6->ip6_nxt = in6p->in6p_ip6_nxt; ip6->ip6_hlim = in6_selecthlim(in6p, oifp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); V_icmp6stat.icp6s_outhist[type]++; } else V_rip6stat.rip6s_opackets++; goto freectl; bad: if (m) m_freem(m); freectl: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(in6p); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ?
ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct socket *so) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in6_pcbdetach(inp); in6_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_NET(so->so_vnet); INIT_VNET_INET(so->so_vnet); INIT_VNET_INET6(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ia = NULL; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return (EADDRNOTAVAIL); if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { return (EADDRNOTAVAIL); } INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { INIT_VNET_NET(so->so_vnet); INIT_VNET_INET(so->so_vnet); INIT_VNET_INET6(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; struct ifnet *ifp = NULL; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); 
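	/*
	 * Userland view of these handlers (illustrative): a privileged
	 * application exercising rip6_attach(), rip6_connect() and
	 * rip6_output() does roughly the following; the protocol number
	 * and checksum offset are arbitrary examples, not values taken
	 * from this file.
	 *
	 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_OSPFIGP);
	 *	int off = 12;
	 *	setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM, &off, sizeof(off));
	 *	connect(s, (struct sockaddr *)&sin6, sizeof(sin6));
	 *
	 * IPV6_CHECKSUM sets in6p_cksum, which rip6_output() uses to fill
	 * in the checksum and rip6_input() uses to verify it.
	 */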
if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family != AF_INET6) return (EAFNOSUPPORT); /* * Application should provide a proper zone ID or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as it should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp, NULL, so->so_cred, &ifp, &error); if (in6a == NULL) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error ? error : EADDRNOTAVAIL); } /* XXX: see above */ if (ifp && scope_ambiguous && (error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = *in6a; soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } if (nam->sa_len != sizeof(struct sockaddr_in6)) { m_freem(m); return (EINVAL); } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { m_freem(m); return(EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; Index: head/sys/netinet6/scope6.c =================================================================== --- head/sys/netinet6/scope6.c (revision 185087) +++ head/sys/netinet6/scope6.c (revision 185088) @@ -1,497 +1,501 @@ /*- * Copyright (C) 2000 WIDE Project. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#ifdef ENABLE_DEFAULT_SCOPE -int ip6_use_defzone = 1; -#else -int ip6_use_defzone = 0; -#endif /* * The scope6_lock protects the global sid default stored in * sid_default below. */ static struct mtx scope6_lock; #define SCOPE6_LOCK_INIT() mtx_init(&scope6_lock, "scope6_lock", NULL, MTX_DEF) #define SCOPE6_LOCK() mtx_lock(&scope6_lock) #define SCOPE6_UNLOCK() mtx_unlock(&scope6_lock) #define SCOPE6_LOCK_ASSERT() mtx_assert(&scope6_lock, MA_OWNED) +#ifdef VIMAGE_GLOBALS static struct scope6_id sid_default; +int ip6_use_defzone; +#endif + #define SID(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id) void scope6_init(void) { INIT_VNET_INET6(curvnet); +#ifdef ENABLE_DEFAULT_SCOPE + V_ip6_use_defzone = 1; +#else + V_ip6_use_defzone = 0; +#endif SCOPE6_LOCK_INIT(); bzero(&V_sid_default, sizeof(V_sid_default)); } struct scope6_id * scope6_ifattach(struct ifnet *ifp) { struct scope6_id *sid; sid = (struct scope6_id *)malloc(sizeof(*sid), M_IFADDR, M_WAITOK); bzero(sid, sizeof(*sid)); /* * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard. * Should we rather hardcode here? */ sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; #ifdef MULTI_SCOPE /* by default, we don't care about scope boundary for these scopes. */ sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL] = 1; sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL] = 1; #endif return sid; } void scope6_ifdetach(struct scope6_id *sid) { free(sid, M_IFADDR); } int scope6_set(struct ifnet *ifp, struct scope6_id *idlist) { INIT_VNET_NET(ifp->if_vnet); int i; int error = 0; struct scope6_id *sid = NULL; IF_AFDATA_LOCK(ifp); sid = SID(ifp); if (!sid) { /* paranoid? */ IF_AFDATA_UNLOCK(ifp); return (EINVAL); } /* * XXX: We need more consistency checks of the relationship among * scopes (e.g. an organization should be larger than a site). 
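The scope6.c hunks above show the conversion pattern this commit applies throughout the tree: the compile-time initializer is dropped, the global moves under #ifdef VIMAGE_GLOBALS, and an init function assigns the value at runtime through the V_ accessor so that each virtual network stack can eventually carry its own copy. Schematically, with hypothetical names (foo_enable, INIT_VNET_FOO and V_foo_enable are placeholders, not symbols from this commit):

	#ifdef VIMAGE_GLOBALS
	int foo_enable;			/* was: int foo_enable = 1; */
	#endif

	void
	foo_init(void)
	{
		INIT_VNET_FOO(curvnet);

		/* Runtime, per-vnet initialization replaces the static one. */
		V_foo_enable = 1;
	}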
*/ /* * TODO(XXX): after setting, we should reflect the changes to * interface addresses, routing table entries, PCB entries... */ SCOPE6_LOCK(); for (i = 0; i < 16; i++) { if (idlist->s6id_list[i] && idlist->s6id_list[i] != sid->s6id_list[i]) { /* * An interface zone ID must be the corresponding * interface index by definition. */ if (i == IPV6_ADDR_SCOPE_INTFACELOCAL && idlist->s6id_list[i] != ifp->if_index) { IF_AFDATA_UNLOCK(ifp); SCOPE6_UNLOCK(); return (EINVAL); } if (i == IPV6_ADDR_SCOPE_LINKLOCAL && idlist->s6id_list[i] > V_if_index) { /* * XXX: theoretically, there should be no * relationship between link IDs and interface * IDs, but we check the consistency for * safety in later use. */ IF_AFDATA_UNLOCK(ifp); SCOPE6_UNLOCK(); return (EINVAL); } /* * XXX: we must need lots of work in this case, * but we simply set the new value in this initial * implementation. */ sid->s6id_list[i] = idlist->s6id_list[i]; } } SCOPE6_UNLOCK(); IF_AFDATA_UNLOCK(ifp); return (error); } int scope6_get(struct ifnet *ifp, struct scope6_id *idlist) { /* We only need to lock the interface's afdata for SID() to work. */ IF_AFDATA_LOCK(ifp); struct scope6_id *sid = SID(ifp); if (sid == NULL) { /* paranoid? */ IF_AFDATA_UNLOCK(ifp); return (EINVAL); } SCOPE6_LOCK(); *idlist = *sid; SCOPE6_UNLOCK(); IF_AFDATA_UNLOCK(ifp); return (0); } /* * Get a scope of the address. Node-local, link-local, site-local or global. */ int in6_addrscope(struct in6_addr *addr) { int scope; if (addr->s6_addr[0] == 0xfe) { scope = addr->s6_addr[1] & 0xc0; switch (scope) { case 0x80: return IPV6_ADDR_SCOPE_LINKLOCAL; break; case 0xc0: return IPV6_ADDR_SCOPE_SITELOCAL; break; default: return IPV6_ADDR_SCOPE_GLOBAL; /* just in case */ break; } } if (addr->s6_addr[0] == 0xff) { scope = addr->s6_addr[1] & 0x0f; /* * due to other scope such as reserved, * return scope doesn't work. */ switch (scope) { case IPV6_ADDR_SCOPE_INTFACELOCAL: return IPV6_ADDR_SCOPE_INTFACELOCAL; break; case IPV6_ADDR_SCOPE_LINKLOCAL: return IPV6_ADDR_SCOPE_LINKLOCAL; break; case IPV6_ADDR_SCOPE_SITELOCAL: return IPV6_ADDR_SCOPE_SITELOCAL; break; default: return IPV6_ADDR_SCOPE_GLOBAL; break; } } /* * Regard loopback and unspecified addresses as global, since * they have no ambiguity. */ if (bcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) { if (addr->s6_addr[15] == 1) /* loopback */ return IPV6_ADDR_SCOPE_LINKLOCAL; if (addr->s6_addr[15] == 0) /* unspecified */ return IPV6_ADDR_SCOPE_GLOBAL; /* XXX: correct? */ } return IPV6_ADDR_SCOPE_GLOBAL; } /* * ifp - note that this might be NULL */ void scope6_setdefault(struct ifnet *ifp) { INIT_VNET_INET6(ifp->if_vnet); /* * Currently, this function just sets the default "interfaces" * and "links" according to the given interface. * We might eventually have to separate the notion of "link" from * "interface" and provide a user interface to set the default. 
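The classification in in6_addrscope() above keys on the first two bytes of the address: an fe80::/10 or fec0::/10 prefix selects link-local or site-local unicast, and for ff00::/8 multicast the scope is the low nibble of the second byte. The same decision tree, condensed into a self-contained sketch (the hex return values correspond to the IPV6_ADDR_SCOPE_* constants; the loopback/unspecified special cases are left out):

	/* Scope of a 16-byte IPv6 address; a compact restatement. */
	static int
	addr_scope(const unsigned char a[16])
	{

		if (a[0] == 0xfe) {
			switch (a[1] & 0xc0) {
			case 0x80:
				return (0x02);	/* link-local */
			case 0xc0:
				return (0x05);	/* site-local */
			default:
				return (0x0e);	/* global, just in case */
			}
		}
		if (a[0] == 0xff)
			return (a[1] & 0x0f);	/* multicast scope field */
		return (0x0e);			/* global */
	}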
*/ SCOPE6_LOCK(); if (ifp) { V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; } else { V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0; V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0; } SCOPE6_UNLOCK(); } int scope6_get_default(struct scope6_id *idlist) { INIT_VNET_INET6(curvnet); SCOPE6_LOCK(); *idlist = V_sid_default; SCOPE6_UNLOCK(); return (0); } u_int32_t scope6_addr2default(struct in6_addr *addr) { INIT_VNET_INET6(curvnet); u_int32_t id; /* * Special case: the loopback address should be considered * link-local, but there's no ambiguity in its syntax. */ if (IN6_IS_ADDR_LOOPBACK(addr)) return (0); /* * XXX: a 32-bit read is atomic on all our platforms; is it OK * not to lock here? */ SCOPE6_LOCK(); id = V_sid_default.s6id_list[in6_addrscope(addr)]; SCOPE6_UNLOCK(); return (id); } /* * Validate the scope zone ID in the sin6_scope_id field. If the ID is * unspecified (=0) but one is required and the default zone ID may be used, * substitute the default value. * This routine then generates the kernel-internal form: if the address scope * is interface-local or link-local, embed the interface index in the * address. */ int sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok) { INIT_VNET_NET(curvnet); struct ifnet *ifp; u_int32_t zoneid; if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok) zoneid = scope6_addr2default(&sin6->sin6_addr); if (zoneid != 0 && (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) { /* * At this moment, we only check interface-local and * link-local scope IDs, and use interface indices as the * zone IDs assuming a one-to-one mapping between interfaces * and links. */ if (V_if_index < zoneid) return (ENXIO); ifp = ifnet_byindex(zoneid); if (ifp == NULL) /* XXX: this can happen for some OS */ return (ENXIO); /* XXX assignment to 16bit from 32bit variable */ sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff); sin6->sin6_scope_id = 0; } return 0; } /* * Generate a standard sockaddr_in6 from the embedded form. */ int sa6_recoverscope(struct sockaddr_in6 *sin6) { INIT_VNET_NET(curvnet); char ip6buf[INET6_ADDRSTRLEN]; u_int32_t zoneid; if (sin6->sin6_scope_id != 0) { log(LOG_NOTICE, "sa6_recoverscope: assumption failure (non 0 ID): %s%%%d\n", ip6_sprintf(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id); /* XXX: proceed anyway... */ } if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) { /* * KAME assumption: link id == interface id */ zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]); if (zoneid) { /* sanity check */ if (zoneid < 0 || V_if_index < zoneid) return (ENXIO); if (!ifnet_byindex(zoneid)) return (ENXIO); sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = zoneid; } } return 0; } /* * Determine the appropriate scope zone ID for in6 and ifp. If ret_id is * non-NULL, it is set to the zone ID. If the zone ID needs to be embedded * in the in6_addr structure, in6 will be modified. * * ret_id - unnecessary? */ int in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id) { int scope; u_int32_t zoneid = 0; struct scope6_id *sid; IF_AFDATA_LOCK(ifp); sid = SID(ifp); #ifdef DIAGNOSTIC if (sid == NULL) { /* should not happen */ panic("in6_setscope: scope array is NULL"); /* NOTREACHED */ } #endif /* * Special case: the loopback address can only belong to a loopback * interface.
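sa6_embedscope() and sa6_recoverscope() above translate between the standard sockaddr_in6 representation, with the zone in sin6_scope_id, and the KAME-internal representation, with the zone stuffed into the second 16-bit word of the address. A round-trip sketch of just the byte shuffling, assuming the zone ID has already been validated against the interface table (s6_addr bytes are used instead of the non-portable s6_addr16 alias):

	#include <stdint.h>
	#include <netinet/in.h>

	static void
	embed_zone(struct sockaddr_in6 *sin6, uint16_t zoneid)
	{

		/* fe80::1%2 becomes fe80:2::1 internally. */
		sin6->sin6_addr.s6_addr[2] = zoneid >> 8;
		sin6->sin6_addr.s6_addr[3] = zoneid & 0xff;
		sin6->sin6_scope_id = 0;
	}

	static uint16_t
	recover_zone(struct sockaddr_in6 *sin6)
	{
		uint16_t zoneid;

		zoneid = (sin6->sin6_addr.s6_addr[2] << 8) |
		    sin6->sin6_addr.s6_addr[3];
		sin6->sin6_addr.s6_addr[2] = 0;
		sin6->sin6_addr.s6_addr[3] = 0;
		sin6->sin6_scope_id = zoneid;
		return (zoneid);
	}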
*/ if (IN6_IS_ADDR_LOOPBACK(in6)) { if (!(ifp->if_flags & IFF_LOOPBACK)) { IF_AFDATA_UNLOCK(ifp); return (EINVAL); } else { if (ret_id != NULL) *ret_id = 0; /* there's no ambiguity */ IF_AFDATA_UNLOCK(ifp); return (0); } } scope = in6_addrscope(in6); SCOPE6_LOCK(); switch (scope) { case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */ zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL]; break; case IPV6_ADDR_SCOPE_LINKLOCAL: zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL]; break; case IPV6_ADDR_SCOPE_SITELOCAL: zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL]; break; case IPV6_ADDR_SCOPE_ORGLOCAL: zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL]; break; default: zoneid = 0; /* XXX: treat as global. */ break; } SCOPE6_UNLOCK(); IF_AFDATA_UNLOCK(ifp); if (ret_id != NULL) *ret_id = zoneid; if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */ return (0); } /* * Just clear the embedded scope identifier. Return 0 if the original address * is intact; return non 0 if the address is modified. */ int in6_clearscope(struct in6_addr *in6) { int modified = 0; if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) { if (in6->s6_addr16[1] != 0) modified = 1; in6->s6_addr16[1] = 0; } return (modified); } Index: head/sys/netinet6/vinet6.h =================================================================== --- head/sys/netinet6/vinet6.h (revision 185087) +++ head/sys/netinet6/vinet6.h (revision 185088) @@ -1,259 +1,261 @@ /*- * Copyright (c) 2006-2008 University of Zagreb * Copyright (c) 2006-2008 FreeBSD Foundation * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _NETINET6_VINET6_H_ #define _NETINET6_VINET6_H_ #ifdef VIMAGE #include #include #include #include #include #include #include #include #include #include #include struct vnet_inet6 { struct in6_ifaddr * _in6_ifaddr; u_int _frag6_nfragpackets; u_int _frag6_nfrags; struct ip6q _ip6q; struct route_in6 _ip6_forward_rt; struct in6_addrpolicy _defaultaddrpolicy; TAILQ_HEAD(, addrsel_policyent) _addrsel_policytab; u_int _in6_maxmtu; int _ip6_auto_linklocal; int _rtq_minreallyold6; int _rtq_reallyold6; int _rtq_toomany6; struct ip6stat _ip6stat; struct rip6stat _rip6stat; struct icmp6stat _icmp6stat; int _rtq_timeout6; struct callout _rtq_timer6; struct callout _rtq_mtutimer; struct callout _nd6_slowtimo_ch; struct callout _nd6_timer_ch; struct callout _in6_tmpaddrtimer_ch; int _nd6_inuse; int _nd6_allocated; struct llinfo_nd6 _llinfo_nd6; struct nd_drhead _nd_defrouter; struct nd_prhead _nd_prefix; struct ifnet * _nd6_defifp; int _nd6_defifindex; struct scope6_id _sid_default; TAILQ_HEAD(, dadq) _dadq; int _dad_init; int _icmp6errpps_count; int _icmp6errppslim_last; int _ip6_forwarding; int _ip6_sendredirects; int _ip6_defhlim; int _ip6_defmcasthlim; int _ip6_accept_rtadv; int _ip6_maxfragpackets; int _ip6_maxfrags; int _ip6_log_interval; int _ip6_hdrnestlimit; int _ip6_dad_count; int _ip6_auto_flowlabel; int _ip6_use_deprecated; int _ip6_rr_prune; int _ip6_mcast_pmtu; int _ip6_v6only; int _ip6_keepfaith; int _ip6stealth; time_t _ip6_log_time; + int _nd6_onlink_ns_rfc4861; int _pmtu_expire; int _pmtu_probe; u_long _rip6_sendspace; u_long _rip6_recvspace; int _icmp6_rediraccept; int _icmp6_redirtimeout; int _icmp6errppslim; int _icmp6_nodeinfo; int _udp6_sendspace; int _udp6_recvspace; int _ip6qmaxlen; int _ip6_prefer_tempaddr; int _ip6_forward_srcrt; int _ip6_sourcecheck; int _ip6_sourcecheck_interval; int _ip6_ours_check_algorithm; int _nd6_prune; int _nd6_delay; int _nd6_umaxtries; int _nd6_mmaxtries; int _nd6_useloopback; int _nd6_gctimer; int _nd6_maxndopt; int _nd6_maxnudhint; int _nd6_maxqueuelen; int _nd6_debug; int _nd6_recalc_reachtm_interval; int _dad_ignore_ns; int _dad_maxtry; int _ip6_use_tempaddr; int _ip6_desync_factor; u_int32_t _ip6_temp_preferred_lifetime; u_int32_t _ip6_temp_valid_lifetime; int _ip6_mrouter_ver; int _pim6; u_int _mrt6debug; int _ip6_temp_regen_advance; int _ip6_use_defzone; struct ip6_pktopts _ip6_opts; }; #endif #define INIT_VNET_INET6(vnet) \ INIT_FROM_VNET(vnet, VNET_MOD_INET6, struct vnet_inet6, vnet_inet6) #define VNET_INET6(sym) VSYM(vnet_inet6, sym) /* * Symbol translation macros */ #define V_addrsel_policytab VNET_INET6(addrsel_policytab) #define V_dad_ignore_ns VNET_INET6(dad_ignore_ns) #define V_dad_init VNET_INET6(dad_init) #define V_dad_maxtry VNET_INET6(dad_maxtry) #define V_dadq VNET_INET6(dadq) #define V_defaultaddrpolicy VNET_INET6(defaultaddrpolicy) #define V_frag6_nfragpackets VNET_INET6(frag6_nfragpackets) #define V_frag6_nfrags VNET_INET6(frag6_nfrags) #define V_icmp6_nodeinfo VNET_INET6(icmp6_nodeinfo) #define V_icmp6_rediraccept VNET_INET6(icmp6_rediraccept) #define V_icmp6_redirtimeout VNET_INET6(icmp6_redirtimeout) #define V_icmp6errpps_count VNET_INET6(icmp6errpps_count) #define V_icmp6errppslim VNET_INET6(icmp6errppslim) #define V_icmp6errppslim_last VNET_INET6(icmp6errppslim_last) #define V_icmp6stat VNET_INET6(icmp6stat) #define V_in6_ifaddr VNET_INET6(in6_ifaddr) #define V_in6_maxmtu VNET_INET6(in6_maxmtu) #define V_in6_tmpaddrtimer_ch VNET_INET6(in6_tmpaddrtimer_ch) #define V_ip6_accept_rtadv 
VNET_INET6(ip6_accept_rtadv) #define V_ip6_auto_flowlabel VNET_INET6(ip6_auto_flowlabel) #define V_ip6_auto_linklocal VNET_INET6(ip6_auto_linklocal) #define V_ip6_dad_count VNET_INET6(ip6_dad_count) #define V_ip6_defhlim VNET_INET6(ip6_defhlim) #define V_ip6_defmcasthlim VNET_INET6(ip6_defmcasthlim) #define V_ip6_desync_factor VNET_INET6(ip6_desync_factor) #define V_ip6_forward_rt VNET_INET6(ip6_forward_rt) #define V_ip6_forward_srcrt VNET_INET6(ip6_forward_srcrt) #define V_ip6_forwarding VNET_INET6(ip6_forwarding) #define V_ip6_hdrnestlimit VNET_INET6(ip6_hdrnestlimit) #define V_ip6_keepfaith VNET_INET6(ip6_keepfaith) #define V_ip6_log_interval VNET_INET6(ip6_log_interval) #define V_ip6_log_time VNET_INET6(ip6_log_time) #define V_ip6_maxfragpackets VNET_INET6(ip6_maxfragpackets) #define V_ip6_maxfrags VNET_INET6(ip6_maxfrags) #define V_ip6_mcast_pmtu VNET_INET6(ip6_mcast_pmtu) #define V_ip6_mrouter_ver VNET_INET6(ip6_mrouter_ver) #define V_ip6_opts VNET_INET6(ip6_opts) #define V_ip6_ours_check_algorithm VNET_INET6(ip6_ours_check_algorithm) #define V_ip6_prefer_tempaddr VNET_INET6(ip6_prefer_tempaddr) #define V_ip6_rr_prune VNET_INET6(ip6_rr_prune) #define V_ip6_sendredirects VNET_INET6(ip6_sendredirects) #define V_ip6_sourcecheck VNET_INET6(ip6_sourcecheck) #define V_ip6_sourcecheck_interval VNET_INET6(ip6_sourcecheck_interval) #define V_ip6_temp_preferred_lifetime VNET_INET6(ip6_temp_preferred_lifetime) #define V_ip6_temp_regen_advance VNET_INET6(ip6_temp_regen_advance) #define V_ip6_temp_valid_lifetime VNET_INET6(ip6_temp_valid_lifetime) #define V_ip6_use_defzone VNET_INET6(ip6_use_defzone) #define V_ip6_use_deprecated VNET_INET6(ip6_use_deprecated) #define V_ip6_use_tempaddr VNET_INET6(ip6_use_tempaddr) #define V_ip6_v6only VNET_INET6(ip6_v6only) #define V_ip6q VNET_INET6(ip6q) #define V_ip6qmaxlen VNET_INET6(ip6qmaxlen) #define V_ip6stat VNET_INET6(ip6stat) #define V_ip6stealth VNET_INET6(ip6stealth) #define V_llinfo_nd6 VNET_INET6(llinfo_nd6) #define V_mrt6debug VNET_INET6(mrt6debug) #define V_nd6_allocated VNET_INET6(nd6_allocated) #define V_nd6_debug VNET_INET6(nd6_debug) #define V_nd6_defifindex VNET_INET6(nd6_defifindex) #define V_nd6_defifp VNET_INET6(nd6_defifp) #define V_nd6_delay VNET_INET6(nd6_delay) #define V_nd6_gctimer VNET_INET6(nd6_gctimer) #define V_nd6_inuse VNET_INET6(nd6_inuse) #define V_nd6_maxndopt VNET_INET6(nd6_maxndopt) #define V_nd6_maxnudhint VNET_INET6(nd6_maxnudhint) #define V_nd6_maxqueuelen VNET_INET6(nd6_maxqueuelen) #define V_nd6_mmaxtries VNET_INET6(nd6_mmaxtries) +#define V_nd6_onlink_ns_rfc4861 VNET_INET6(nd6_onlink_ns_rfc4861) #define V_nd6_prune VNET_INET6(nd6_prune) #define V_nd6_recalc_reachtm_interval VNET_INET6(nd6_recalc_reachtm_interval) #define V_nd6_slowtimo_ch VNET_INET6(nd6_slowtimo_ch) #define V_nd6_timer_ch VNET_INET6(nd6_timer_ch) #define V_nd6_umaxtries VNET_INET6(nd6_umaxtries) #define V_nd6_useloopback VNET_INET6(nd6_useloopback) #define V_nd_defrouter VNET_INET6(nd_defrouter) #define V_nd_prefix VNET_INET6(nd_prefix) #define V_pim6 VNET_INET6(pim6) #define V_pmtu_expire VNET_INET6(pmtu_expire) #define V_pmtu_probe VNET_INET6(pmtu_probe) #define V_rip6_recvspace VNET_INET6(rip6_recvspace) #define V_rip6_sendspace VNET_INET6(rip6_sendspace) #define V_rip6stat VNET_INET6(rip6stat) #define V_rtq_minreallyold6 VNET_INET6(rtq_minreallyold6) #define V_rtq_mtutimer VNET_INET6(rtq_mtutimer) #define V_rtq_reallyold6 VNET_INET6(rtq_reallyold6) #define V_rtq_timeout6 VNET_INET6(rtq_timeout6) #define V_rtq_timer6 VNET_INET6(rtq_timer6) #define 
V_rtq_toomany6 VNET_INET6(rtq_toomany6) #define V_sid_default VNET_INET6(sid_default) #define V_udp6_recvspace VNET_INET6(udp6_recvspace) #define V_udp6_sendspace VNET_INET6(udp6_sendspace) #endif /* !_NETINET6_VINET6_H_ */ Index: head/sys/netipsec/ipsec.c =================================================================== --- head/sys/netipsec/ipsec.c (revision 185087) +++ head/sys/netipsec/ipsec.c (revision 185088) @@ -1,2008 +1,2043 @@ /* $FreeBSD$ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include /*XXX*/ #include #include #include #include #include #include #include -#ifdef IPSEC_DEBUG -int ipsec_debug = 1; -#else -int ipsec_debug = 0; -#endif - +#ifdef VIMAGE_GLOBALS /* NB: name changed so netstat doesn't use it */ struct ipsecstat ipsec4stat; -int ip4_ah_offsetmask = 0; /* maybe IP_DF? */ -int ip4_ipsec_dfbit = 0; /* DF bit on encap. 
0: clear 1: set 2: copy */ -int ip4_esp_trans_deflev = IPSEC_LEVEL_USE; -int ip4_esp_net_deflev = IPSEC_LEVEL_USE; -int ip4_ah_trans_deflev = IPSEC_LEVEL_USE; -int ip4_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip4_def_policy; -int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ -int ip4_esp_randpad = -1; +int ipsec_debug; +int ip4_ah_offsetmask; +int ip4_ipsec_dfbit; +int ip4_esp_trans_deflev; +int ip4_esp_net_deflev; +int ip4_ah_trans_deflev; +int ip4_ah_net_deflev; +int ip4_ipsec_ecn; +int ip4_esp_randpad; /* * Crypto support requirements: * * 1 require hardware support * -1 require software support * 0 take anything */ -int crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; +int crypto_support; +#endif /* VIMAGE_GLOBALS */ SYSCTL_DECL(_net_inet_ipsec); /* net.inet.ipsec */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, "IPsec default policy."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, ip4_esp_trans_deflev, 0, "Default ESP transport mode level"); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, ip4_esp_net_deflev, 0, "Default ESP tunnel mode level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, ip4_ah_trans_deflev, 0, "AH transfer mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, ip4_ah_net_deflev, 0, "AH tunnel mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, CTLFLAG_RW, ah_cleartos, 0, "If set clear type-of-service field when doing AH computation."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, CTLFLAG_RW, ip4_ah_offsetmask, 0, "If not set clear offset field mask when doing AH computation."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DFBIT, dfbit, CTLFLAG_RW, ip4_ipsec_dfbit, 0, "Do not fragment bit on encap."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_ECN, ecn, CTLFLAG_RW, ip4_ipsec_ecn, 0, "Explicit Congestion Notification handling."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEBUG, debug, CTLFLAG_RW, ipsec_debug, 0, "Enable IPsec debugging output when set."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, crypto_support, CTLFLAG_RW, crypto_support,0, "Crypto driver selection."); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, ipsecstats, CTLFLAG_RD, ipsec4stat, ipsecstat, "IPsec IPv4 statistics."); #ifdef REGRESSION +#ifdef VIMAGE_GLOBALS +int ipsec_replay; +int ipsec_integrity; +#endif /* * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ -int ipsec_replay = 0; SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_replay, CTLFLAG_RW, ipsec_replay, 0, "Emulate replay attack"); /* * When set 1, IPsec will send packets with corrupted HMAC. * This allows to verify if the other side properly detects modified packets. 
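The test_replay and test_integrity knobs declared here are plain integers under net.inet.ipsec, so a REGRESSION test harness can flip them at runtime with sysctlbyname(3); a minimal userland sketch (the wrapper function is hypothetical, the sysctl name comes from the declarations above):

	#include <sys/types.h>
	#include <sys/sysctl.h>

	/* Turn on replay-attack emulation; only exists on REGRESSION kernels. */
	static int
	enable_replay_test(void)
	{
		int one = 1;

		return (sysctlbyname("net.inet.ipsec.test_replay",
		    NULL, NULL, &one, sizeof(one)));
	}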
*/ -int ipsec_integrity = 0; SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_integrity, CTLFLAG_RW, ipsec_integrity, 0, "Emulate man-in-the-middle attack"); #endif #ifdef INET6 +#ifdef VIMAGE_GLOBALS struct ipsecstat ipsec6stat; -int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; -int ip6_esp_net_deflev = IPSEC_LEVEL_USE; -int ip6_ah_trans_deflev = IPSEC_LEVEL_USE; -int ip6_ah_net_deflev = IPSEC_LEVEL_USE; -int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ +int ip6_esp_trans_deflev; +int ip6_esp_net_deflev; +int ip6_ah_trans_deflev; +int ip6_ah_net_deflev; +int ip6_ipsec_ecn; +#endif SYSCTL_DECL(_net_inet6_ipsec6); /* net.inet6.ipsec6 */ #ifdef COMPAT_KAME SYSCTL_OID(_net_inet6_ipsec6, IPSECCTL_STATS, stats, CTLFLAG_RD, 0, 0, compat_ipsecstats_sysctl, "S", "IPsec IPv6 statistics."); #endif /* COMPAT_KAME */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, "IPsec default policy."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, ip6_esp_trans_deflev, 0, "Default ESP transport mode level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, ip6_esp_net_deflev, 0, "Default ESP tunnel mode level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, ip6_ah_trans_deflev, 0, "AH transfer mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, ip6_ah_net_deflev, 0, "AH tunnel mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_ECN, ecn, CTLFLAG_RW, ip6_ipsec_ecn, 0, "Explicit Congestion Notification handling."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_RW, ipsec_debug, 0, "Enable IPsec debugging output when set."); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_STATS, ipsecstats, CTLFLAG_RD, ipsec6stat, ipsecstat, "IPsec IPv6 statistics."); #endif /* INET6 */ static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); #ifdef INET6 static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); #endif static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #ifdef INET6 static void ipsec6_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #endif static void ipsec_delpcbpolicy __P((struct inpcbpolicy *)); static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src)); static int ipsec_set_policy __P((struct secpolicy **pcb_sp, int optname, caddr_t request, size_t len, struct ucred *cred)); static int ipsec_get_policy __P((struct secpolicy *pcb_sp, struct mbuf **mp)); static void vshiftl __P((unsigned char *, int, int)); static size_t ipsec_hdrsiz __P((struct secpolicy *)); MALLOC_DEFINE(M_IPSEC_INPCB, "inpcbpolicy", "inpcb-resident ipsec policy"); + +void +ipsec_init(void) +{ + INIT_VNET_IPSEC(curvnet); + +#ifdef IPSEC_DEBUG + V_ipsec_debug = 1; +#else + V_ipsec_debug = 0; +#endif + + V_ip4_ah_offsetmask = 0; /* maybe IP_DF? */ + V_ip4_ipsec_dfbit = 0; /* DF bit on encap. 
0: clear 1: set 2: copy */ + V_ip4_esp_trans_deflev = IPSEC_LEVEL_USE; + V_ip4_esp_net_deflev = IPSEC_LEVEL_USE; + V_ip4_ah_trans_deflev = IPSEC_LEVEL_USE; + V_ip4_ah_net_deflev = IPSEC_LEVEL_USE; + V_ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ + V_ip4_esp_randpad = -1; + + V_crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; + +#ifdef REGRESSION + V_ipsec_replay = 0; + V_ipsec_integrity = 0; +#endif + + V_ip6_esp_trans_deflev = IPSEC_LEVEL_USE; + V_ip6_esp_net_deflev = IPSEC_LEVEL_USE; + V_ip6_ah_trans_deflev = IPSEC_LEVEL_USE; + V_ip6_ah_net_deflev = IPSEC_LEVEL_USE; + V_ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ +} /* * Return a held reference to the default SP. */ static struct secpolicy * key_allocsp_default(const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default from %s:%u\n", where, tag)); sp = &V_ip4_def_policy; if (sp->policy != IPSEC_POLICY_DISCARD && sp->policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", sp->policy, IPSEC_POLICY_NONE)); sp->policy = IPSEC_POLICY_NONE; } key_addref(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default returns SP:%p (%u)\n", sp, sp->refcnt)); return sp; } #define KEY_ALLOCSP_DEFAULT() \ key_allocsp_default(__FILE__, __LINE__) /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. */ struct secpolicy * ipsec_getpolicy(struct tdb_ident *tdbi, u_int dir) { struct secpolicy *sp; IPSEC_ASSERT(tdbi != NULL, ("null tdbi")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = KEY_ALLOCSP2(tdbi->spi, &tdbi->dst, tdbi->proto, dir); if (sp == NULL) /*XXX????*/ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return sp; } /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. 
*/ struct secpolicy * ipsec_getpolicybysock(m, dir, inp, error) struct mbuf *m; u_int dir; struct inpcb *inp; int *error; { INIT_VNET_IPSEC(curvnet); struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(inp != NULL, ("null inpcb")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); /* set spidx in pcb */ if (inp->inp_vflag & INP_IPV6PROTO) { #ifdef INET6 *error = ipsec6_setspidx_in6pcb(m, inp); pcbsp = inp->in6p_sp; #else *error = EINVAL; /* should not happen */ #endif } else { *error = ipsec4_setspidx_inpcb(m, inp); pcbsp = inp->inp_sp; } if (*error) return NULL; IPSEC_ASSERT(pcbsp != NULL, ("null pcbsp")); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; } IPSEC_ASSERT(currsp != NULL, ("null currsp")); if (pcbsp->priv) { /* when privilieged socket */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; case IPSEC_POLICY_ENTRUST: /* look for a policy in SPD */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) /* no SP found */ sp = KEY_ALLOCSP_DEFAULT(); break; default: ipseclog((LOG_ERR, "%s: Invalid policy for PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return NULL; } } else { /* unpriv, SPD has policy */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) { /* no SP found */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "%s: Illegal policy for " "non-priviliged defined %d\n", __func__, currsp->policy)); *error = EINVAL; return NULL; case IPSEC_POLICY_ENTRUST: sp = KEY_ALLOCSP_DEFAULT(); break; case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; default: ipseclog((LOG_ERR, "%s: Invalid policy for " "PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return NULL; } } } IPSEC_ASSERT(sp != NULL, ("null SP (priv %u policy %u", pcbsp->priv, currsp->policy)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s (priv %u policy %u) allocate SP:%p (refcnt %u)\n", __func__, pcbsp->priv, currsp->policy, sp, sp->refcnt)); return sp; } /* * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. */ struct secpolicy * ipsec_getpolicybyaddr(m, dir, flag, error) struct mbuf *m; u_int dir; int flag; int *error; { INIT_VNET_IPSEC(curvnet); struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = NULL; if (key_havesp(dir)) { /* Make an index to look for a policy. */ *error = ipsec_setspidx(m, &spidx, (flag & IP_FORWARDING) ? 
0 : 1); if (*error != 0) { DPRINTF(("%s: setpidx failed, dir %u flag %u\n", __func__, dir, flag)); return NULL; } spidx.dir = dir; sp = KEY_ALLOCSP(&spidx, dir); } if (sp == NULL) /* no SP found, use system default */ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return sp; } struct secpolicy * ipsec4_checkpolicy(m, dir, flag, error, inp) struct mbuf *m; u_int dir, flag; int *error; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; *error = 0; if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, flag, error); else sp = ipsec_getpolicybysock(m, dir, inp, error); if (sp == NULL) { IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error")); V_ipsec4stat.ips_out_inval++; return NULL; } IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error)); switch (sp->policy) { case IPSEC_POLICY_ENTRUST: default: printf("%s: invalid policy %u\n", __func__, sp->policy); /* fall thru... */ case IPSEC_POLICY_DISCARD: V_ipsec4stat.ips_out_polvio++; *error = -EINVAL; /* packet is discarded by caller */ break; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: KEY_FREESP(&sp); sp = NULL; /* NB: force NULL result */ break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) /* acquire an SA */ *error = key_spdacquire(sp); break; } if (*error != 0) { KEY_FREESP(&sp); sp = NULL; } return sp; } static int ipsec4_setspidx_inpcb(m, pcb) struct mbuf *m; struct inpcb *pcb; { int error; IPSEC_ASSERT(pcb != NULL, ("null pcb")); IPSEC_ASSERT(pcb->inp_sp != NULL, ("null inp_sp")); IPSEC_ASSERT(pcb->inp_sp->sp_out != NULL && pcb->inp_sp->sp_in != NULL, ("null sp_in || sp_out")); error = ipsec_setspidx(m, &pcb->inp_sp->sp_in->spidx, 1); if (error == 0) { pcb->inp_sp->sp_in->spidx.dir = IPSEC_DIR_INBOUND; pcb->inp_sp->sp_out->spidx = pcb->inp_sp->sp_in->spidx; pcb->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND; } else { bzero(&pcb->inp_sp->sp_in->spidx, sizeof (pcb->inp_sp->sp_in->spidx)); bzero(&pcb->inp_sp->sp_out->spidx, sizeof (pcb->inp_sp->sp_in->spidx)); } return error; } #ifdef INET6 static int ipsec6_setspidx_in6pcb(m, pcb) struct mbuf *m; struct in6pcb *pcb; { //INIT_VNET_IPSEC(curvnet); struct secpolicyindex *spidx; int error; IPSEC_ASSERT(pcb != NULL, ("null pcb")); IPSEC_ASSERT(pcb->in6p_sp != NULL, ("null inp_sp")); IPSEC_ASSERT(pcb->in6p_sp->sp_out != NULL && pcb->in6p_sp->sp_in != NULL, ("null sp_in || sp_out")); bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx)); spidx = &pcb->in6p_sp->sp_in->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_INBOUND; spidx = &pcb->in6p_sp->sp_out->spidx; error = ipsec_setspidx(m, spidx, 1); if (error) goto bad; spidx->dir = IPSEC_DIR_OUTBOUND; return 0; bad: bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx)); bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx)); return error; } #endif /* * configure security policy index (src/dst/proto/sport/dport) * by looking at the content of mbuf. * the caller is responsible for error recovery (like clearing up spidx). */ static int ipsec_setspidx(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { INIT_VNET_IPSEC(curvnet); struct ip *ip = NULL; struct ip ipbuf; u_int v; struct mbuf *n; int len; int error; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* * validate m->m_pkthdr.len. we see incorrect length if we * mistakenly call this function with inconsistent mbuf chain * (like 4.4BSD tcp/udp processing). XXX should we panic here? 
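The (sp, *error) protocol that ipsec4_checkpolicy() constructs above is easiest to read from the caller's side: a NULL result with *error == 0 means bypass, a NULL result with *error set means discard, and a non-NULL sp means IPsec processing is required. A hypothetical caller sketch, not code from this commit; the error mapping is an assumption and the usual netipsec kernel headers are taken as included:

	static int
	checkpolicy_sketch(struct mbuf *m, struct inpcb *inp)
	{
		struct secpolicy *sp;
		int error;

		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, 0, &error, inp);
		if (sp == NULL) {
			if (error != 0)		/* policy says discard */
				return (EACCES);	/* hypothetical mapping */
			return (0);	/* bypass/none: send in the clear */
		}
		/* IPSEC policy: hand m and sp to the IPsec output machinery. */
		KEY_FREESP(&sp);
		return (0);
	}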
*/ len = 0; for (n = m; n; n = n->m_next) len += n->m_len; if (m->m_pkthdr.len != len) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) mismatch (%d), ignored.\n", __func__, len, m->m_pkthdr.len)); return EINVAL; } if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v4), ignored.\n", __func__, m->m_pkthdr.len)); return EINVAL; } if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } #ifdef _IP_VHL v = _IP_VHL_V(ip->ip_vhl); #else v = ip->ip_v; #endif switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return error; ipsec4_get_ulp(m, spidx, needport); return 0; #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v6), " "ignored\n", __func__, m->m_pkthdr.len)); return EINVAL; } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return error; ipsec6_get_ulp(m, spidx, needport); return 0; #endif default: KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: " "unknown IP version %u, ignored.\n", __func__, v)); return EINVAL; } } static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { u_int8_t nxt; int off; /* sanity check */ IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(m->m_pkthdr.len >= sizeof(struct ip),("packet too short")); /* NB: ip_input() flips it into host endian XXX need more checking */ if (m->m_len < sizeof (struct ip)) { struct ip *ip = mtod(m, struct ip *); if (ip->ip_off & (IP_MF | IP_OFFMASK)) goto done; #ifdef _IP_VHL off = _IP_VHL_HL(ip->ip_vhl) << 2; #else off = ip->ip_hl << 2; #endif nxt = ip->ip_p; } else { struct ip ih; m_copydata(m, 0, sizeof (struct ip), (caddr_t) &ih); if (ih.ip_off & (IP_MF | IP_OFFMASK)) goto done; #ifdef _IP_VHL off = _IP_VHL_HL(ih.ip_vhl) << 2; #else off = ih.ip_hl << 2; #endif nxt = ih.ip_p; } while (off < m->m_pkthdr.len) { struct ip6_ext ip6e; struct tcphdr th; struct udphdr uh; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (th), (caddr_t) &th); spidx->src.sin.sin_port = th.th_sport; spidx->dst.sin.sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (uh), (caddr_t) &uh); spidx->src.sin.sin_port = uh.uh_sport; spidx->dst.sin.sin_port = uh.uh_dport; return; case IPPROTO_AH: if (off + sizeof(ip6e) > m->m_pkthdr.len) goto done; /* XXX sigh, this works but is totally bogus */ m_copydata(m, off, sizeof(ip6e), (caddr_t) &ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: default: /* XXX intermediate headers??? 
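 * (Anything we cannot parse ends the walk: ul_proto records the
 * protocol number we reached and the ports stay wildcarded.)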
*/ spidx->ul_proto = nxt; goto done_proto; } } done: spidx->ul_proto = IPSEC_ULPROTO_ANY; done_proto: spidx->src.sin.sin_port = IPSEC_PORT_ANY; spidx->dst.sin.sin_port = IPSEC_PORT_ANY; } /* assumes that m is sane */ static int ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { static const struct sockaddr_in template = { sizeof (struct sockaddr_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; spidx->src.sin = template; spidx->dst.sin = template; if (m->m_len < sizeof (struct ip)) { m_copydata(m, offsetof(struct ip, ip_src), sizeof (struct in_addr), (caddr_t) &spidx->src.sin.sin_addr); m_copydata(m, offsetof(struct ip, ip_dst), sizeof (struct in_addr), (caddr_t) &spidx->dst.sin.sin_addr); } else { struct ip *ip = mtod(m, struct ip *); spidx->src.sin.sin_addr = ip->ip_src; spidx->dst.sin.sin_addr = ip->ip_dst; } spidx->prefs = sizeof(struct in_addr) << 3; spidx->prefd = sizeof(struct in_addr) << 3; return 0; } #ifdef INET6 static void ipsec6_get_ulp(m, spidx, needport) struct mbuf *m; struct secpolicyindex *spidx; int needport; { INIT_VNET_IPSEC(curvnet); int off, nxt; struct tcphdr th; struct udphdr uh; struct icmp6_hdr ih; /* sanity check */ if (m == NULL) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(m)); /* set default */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: spidx->ul_proto = nxt; if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(ih), (caddr_t)&ih); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = htons((uint16_t)ih.icmp6_type); ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = htons((uint16_t)ih.icmp6_code); break; default: /* XXX intermediate headers??? 
*/ spidx->ul_proto = nxt; break; } } /* assumes that m is sane */ static int ipsec6_setspidx_ipaddr(m, spidx) struct mbuf *m; struct secpolicyindex *spidx; { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) ip6 = mtod(m, struct ip6_hdr *); else { m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return 0; } #endif static void ipsec_delpcbpolicy(p) struct inpcbpolicy *p; { free(p, M_IPSEC_INPCB); } /* initialize policy in PCB */ int ipsec_init_policy(so, pcb_sp) struct socket *so; struct inpcbpolicy **pcb_sp; { INIT_VNET_IPSEC(curvnet); struct inpcbpolicy *new; /* sanity check. */ if (so == NULL || pcb_sp == NULL) panic("%s: NULL pointer was passed.\n", __func__); new = (struct inpcbpolicy *) malloc(sizeof(struct inpcbpolicy), M_IPSEC_INPCB, M_NOWAIT|M_ZERO); if (new == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } new->priv = IPSEC_IS_PRIVILEGED_SO(so); if ((new->sp_in = KEY_NEWSP()) == NULL) { ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_in->state = IPSEC_SPSTATE_ALIVE; new->sp_in->policy = IPSEC_POLICY_ENTRUST; if ((new->sp_out = KEY_NEWSP()) == NULL) { KEY_FREESP(&new->sp_in); ipsec_delpcbpolicy(new); return ENOBUFS; } new->sp_out->state = IPSEC_SPSTATE_ALIVE; new->sp_out->policy = IPSEC_POLICY_ENTRUST; *pcb_sp = new; return 0; } /* copy old ipsec policy into new */ int ipsec_copy_policy(old, new) struct inpcbpolicy *old, *new; { struct secpolicy *sp; sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { KEY_FREESP(&new->sp_in); new->sp_in = sp; } else return ENOBUFS; sp = ipsec_deepcopy_policy(old->sp_out); if (sp) { KEY_FREESP(&new->sp_out); new->sp_out = sp; } else return ENOBUFS; new->priv = old->priv; return 0; } struct ipsecrequest * ipsec_newisr(void) { struct ipsecrequest *p; p = malloc(sizeof(struct ipsecrequest), M_IPSEC_SR, M_NOWAIT|M_ZERO); if (p != NULL) IPSECREQUEST_LOCK_INIT(p); return p; } void ipsec_delisr(struct ipsecrequest *p) { IPSECREQUEST_LOCK_DESTROY(p); free(p, M_IPSEC_SR); } /* deep-copy a policy in PCB */ static struct secpolicy * ipsec_deepcopy_policy(src) struct secpolicy *src; { struct ipsecrequest *newchain = NULL; struct ipsecrequest *p; struct ipsecrequest **q; struct ipsecrequest *r; struct secpolicy *dst; if (src == NULL) return NULL; dst = KEY_NEWSP(); if (dst == NULL) return NULL; /* * deep-copy IPsec request chain. This is required since struct * ipsecrequest is not reference counted. 
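 * The loop below keeps q aimed at the link field that receives the
 * next copied node, so the chain is rebuilt in its original order
 * in a single pass.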
*/ q = &newchain; for (p = src->req; p; p = p->next) { *q = ipsec_newisr(); if (*q == NULL) goto fail; (*q)->saidx.proto = p->saidx.proto; (*q)->saidx.mode = p->saidx.mode; (*q)->level = p->level; (*q)->saidx.reqid = p->saidx.reqid; bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src)); bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst)); (*q)->sp = dst; q = &((*q)->next); } dst->req = newchain; dst->state = src->state; dst->policy = src->policy; /* do not touch the refcnt fields */ return dst; fail: for (p = newchain; p; p = r) { r = p->next; ipsec_delisr(p); p = NULL; } return NULL; } /* set policy and ipsec request if present. */ static int ipsec_set_policy(pcb_sp, optname, request, len, cred) struct secpolicy **pcb_sp; int optname; caddr_t request; size_t len; struct ucred *cred; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; /* sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: passed policy\n", __func__); kdebug_sadb_x_policy((struct sadb_ext *)xpl)); /* check policy type */ /* ipsec_set_policy() accepts IPSEC, ENTRUST and BYPASS. */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return EINVAL; /* check privileged socket */ if (cred != NULL && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { error = priv_check_cred(cred, PRIV_NETINET_IPSEC, 0); if (error) return EACCES; } /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return error; newsp->state = IPSEC_SPSTATE_ALIVE; /* clear old SP and set new SP */ KEY_FREESP(pcb_sp); *pcb_sp = newsp; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: new policy\n", __func__); kdebug_secpolicy(newsp)); return 0; } static int ipsec_get_policy(pcb_sp, mp) struct secpolicy *pcb_sp; struct mbuf **mp; { INIT_VNET_IPSEC(curvnet); /* sanity check. */ if (pcb_sp == NULL || mp == NULL) return EINVAL; *mp = key_sp2msg(pcb_sp); if (!*mp) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } (*mp)->m_type = MT_DATA; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(*mp)); return 0; } int ipsec4_set_policy(inp, optname, request, len, cred) struct inpcb *inp; int optname; caddr_t request; size_t len; struct ucred *cred; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (inp == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, cred); } int ipsec4_get_policy(inp, request, len, mp) struct inpcb *inp; caddr_t request; size_t len; struct mbuf **mp; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. 
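 * (Normally reached through the IP_IPSEC_POLICY socket option; the
 * request buffer carries a sadb_x_policy header whose direction
 * field selects sp_in or sp_out below.)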
*/ if (inp == NULL || request == NULL || mp == NULL) return EINVAL; IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp")); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } /* delete policy in PCB */ int ipsec4_delete_pcbpolicy(inp) struct inpcb *inp; { IPSEC_ASSERT(inp != NULL, ("null inp")); if (inp->inp_sp == NULL) return 0; if (inp->inp_sp->sp_in != NULL) KEY_FREESP(&inp->inp_sp->sp_in); if (inp->inp_sp->sp_out != NULL) KEY_FREESP(&inp->inp_sp->sp_out); ipsec_delpcbpolicy(inp->inp_sp); inp->inp_sp = NULL; return 0; } #ifdef INET6 int ipsec6_set_policy(in6p, optname, request, len, cred) struct in6pcb *in6p; int optname; caddr_t request; size_t len; struct ucred *cred; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL) return EINVAL; if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_set_policy(pcb_sp, optname, request, len, cred); } int ipsec6_get_policy(in6p, request, len, mp) struct in6pcb *in6p; caddr_t request; size_t len; struct mbuf **mp; { INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* sanity check. */ if (in6p == NULL || request == NULL || mp == NULL) return EINVAL; IPSEC_ASSERT(in6p->in6p_sp != NULL, ("null in6p_sp")); if (len < sizeof(*xpl)) return EINVAL; xpl = (struct sadb_x_policy *)request; /* select direction */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = in6p->in6p_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = in6p->in6p_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return EINVAL; } return ipsec_get_policy(pcb_sp, mp); } int ipsec6_delete_pcbpolicy(in6p) struct in6pcb *in6p; { IPSEC_ASSERT(in6p != NULL, ("null in6p")); if (in6p->in6p_sp == NULL) return 0; if (in6p->in6p_sp->sp_in != NULL) KEY_FREESP(&in6p->in6p_sp->sp_in); if (in6p->in6p_sp->sp_out != NULL) KEY_FREESP(&in6p->in6p_sp->sp_out); ipsec_delpcbpolicy(in6p->in6p_sp); in6p->in6p_sp = NULL; return 0; } #endif /* * return current level. * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned. */ u_int ipsec_get_reqlevel(isr) struct ipsecrequest *isr; { INIT_VNET_IPSEC(curvnet); u_int level = 0; u_int esp_trans_deflev, esp_net_deflev; u_int ah_trans_deflev, ah_net_deflev; IPSEC_ASSERT(isr != NULL && isr->sp != NULL, ("null argument")); IPSEC_ASSERT(isr->sp->spidx.src.sa.sa_family == isr->sp->spidx.dst.sa.sa_family, ("af family mismatch, src %u, dst %u", isr->sp->spidx.src.sa.sa_family, isr->sp->spidx.dst.sa.sa_family)); /* XXX note that we have ipseclog() expanded here - code sync issue */ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) \ ? (V_ipsec_debug \ ? 
log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ : 0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) /* set default level */ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev); break; #endif /* INET6 */ default: panic("%s: unknown af %u", __func__, isr->sp->spidx.src.sa.sa_family); } #undef IPSEC_CHECK_DEFAULT /* set level */ switch (isr->level) { case IPSEC_LEVEL_DEFAULT: switch (isr->saidx.proto) { case IPPROTO_ESP: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = esp_net_deflev; else level = esp_trans_deflev; break; case IPPROTO_AH: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = ah_net_deflev; else level = ah_trans_deflev; break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document says that * we shouldn't compress small packets */ level = IPSEC_LEVEL_USE; break; default: panic("%s: Illegal protocol defined %u\n", __func__, isr->saidx.proto); } break; case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: level = isr->level; break; case IPSEC_LEVEL_UNIQUE: level = IPSEC_LEVEL_REQUIRE; break; default: panic("%s: Illegal IPsec level %u\n", __func__, isr->level); } return level; } /* * Check security policy requirements against the actual * packet contents. Return one if the packet should be * reject as "invalid"; otherwiser return zero to have the * packet treated as "valid". * * OUT: * 0: valid * 1: invalid */ int ipsec_in_reject(struct secpolicy *sp, struct mbuf *m) { INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; int need_auth; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: return 1; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); /* XXX should compare policy against ipsec header history */ need_auth = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE) continue; switch (isr->saidx.proto) { case IPPROTO_ESP: if ((m->m_flags & M_DECRYPTED) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP m_flags:%x\n", __func__, m->m_flags)); return 1; } if (!need_auth && isr->sav != NULL && isr->sav->tdb_authalgxform != NULL && (m->m_flags & M_AUTHIPDGM) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP/AH m_flags:%x\n", __func__, m->m_flags)); return 1; } break; case IPPROTO_AH: need_auth = 1; if ((m->m_flags & M_AUTHIPHDR) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: AH m_flags:%x\n", __func__, m->m_flags)); return 1; } break; case IPPROTO_IPCOMP: /* * we don't really care, as IPcomp document * says that we shouldn't compress small * packets, IPComp policy should always be * treated as being in "use" level. */ break; } } return 0; /* valid */ } /* * Check AH/ESP integrity. 
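 * (That is, verify that a packet which a "require" level policy says
 * must be protected really arrived with the corresponding AH/ESP
 * processing; see the M_DECRYPTED/M_AUTHIPHDR checks above.)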
* This function is called from tcp_input(), udp_input(), * and {ah,esp}4_input for tunnel mode */ int ipsec4_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; int result; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_in_reject(sp, m); if (result) V_ipsec4stat.ips_in_polvio++; KEY_FREESP(&sp); } else { result = 0; /* XXX should be panic ? * -> No, there may be error. */ } return result; } #ifdef INET6 /* * Check AH/ESP integrity. * This function is called from tcp6_input(), udp6_input(), * and {ah,esp}6_input for tunnel mode */ int ipsec6_in_reject(m, inp) struct mbuf *m; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = NULL; int error; int result; /* sanity check */ if (m == NULL) return 0; /* XXX should be panic ? */ /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_in_reject(sp, m); if (result) V_ipsec6stat.ips_in_polvio++; KEY_FREESP(&sp); } else { result = 0; } return result; } #endif /* * compute the byte size to be occupied by IPsec header. * in case it is tunneled, it includes the size of outer IP header. * NOTE: SP passed is free in this function. */ static size_t ipsec_hdrsiz(struct secpolicy *sp) { INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; size_t siz; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return 0; } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); siz = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { size_t clen = 0; switch (isr->saidx.proto) { case IPPROTO_ESP: clen = esp_hdrsiz(isr->sav); break; case IPPROTO_AH: clen = ah_hdrsiz(isr->sav); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (isr->saidx.dst.sa.sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: ipseclog((LOG_ERR, "%s: unknown AF %d in " "IPsec tunnel SA\n", __func__, ((struct sockaddr *)&isr->saidx.dst)->sa_family)); break; } } siz += clen; } return siz; } /* This function is called from ip_forward() and ipsec4_hdrsize_tcp(). */ size_t ipsec4_hdrsiz(m, dir, inp) struct mbuf *m; u_int dir; struct inpcb *inp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; size_t size; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, dir, inp, &error); if (sp != NULL) { size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: size:%lu.\n", __func__, (unsigned long)size)); KEY_FREESP(&sp); } else { size = 0; /* XXX should be panic ? 
* -> No, we are called w/o knowing if * IPsec processing is needed. */ } return size; } #ifdef INET6 /* This function is called from ipsec6_hdrsize_tcp(), * and maybe from ip6_forward.() */ size_t ipsec6_hdrsiz(m, dir, in6p) struct mbuf *m; u_int dir; struct in6pcb *in6p; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; size_t size; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(in6p == NULL || in6p->in6p_socket != NULL, ("socket w/o inpcb")); /* get SP for this packet */ /* XXX Is it right to call with IP_FORWARDING. */ if (in6p == NULL) sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, dir, in6p, &error); if (sp == NULL) return 0; size = ipsec_hdrsiz(sp); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: size:%lu.\n", __func__, (unsigned long)size)); KEY_FREESP(&sp); return size; } #endif /*INET6*/ /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * based on RFC 2401. */ int ipsec_chkreplay(seq, sav) u_int32_t seq; struct secasvar *sav; { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) return 1; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 0; /* first time is always okay */ if (replay->count == 0) return 1; if (seq > replay->lastseq) { /* larger sequences are okay */ return 1; } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. too old or wrapped */ if (diff >= wsizeb) return 0; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 0; /* out of order but good */ return 1; } } /* * check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(seq, sav) u_int32_t seq; struct secasvar *sav; { INIT_VNET_IPSEC(curvnet); struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* constant: bits of window size */ int frlast; /* constant: last frame */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) goto ok; /* no need to check replay. */ /* constant */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ if (seq == 0) return 1; /* first time */ if (replay->count == 0) { replay->lastseq = seq; bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* new larger sequence number */ if (diff < wsizeb) { /* In window */ /* set bit for this packet */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* this packet has a "way larger" */ bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* larger is good */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* over range to check, i.e. 
too old or wrapped */ if (diff >= wsizeb) return 1; fr = frlast - diff / 8; /* this packet already seen ? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return 1; /* mark as seen */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* out of order but good */ } ok: if (replay->count == ~0) { /* set overflow flag */ replay->overflow++; /* don't increment, no more packets accepted */ if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) return 1; ipseclog((LOG_WARNING, "%s: replay counter made %d cycle. %s\n", __func__, replay->overflow, ipsec_logsastr(sav))); } replay->count++; return 0; } /* * shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). */ static void vshiftl(bitmap, nbit, wsize) unsigned char *bitmap; int nbit, wsize; { int s, j, i; unsigned char over; for (j = 0; j < nbit; j += 8) { s = (nbit - j < 8) ? (nbit - j): 8; bitmap[0] <<= s; for (i = 1; i < wsize; i++) { over = (bitmap[i] >> (8 - s)); bitmap[i] <<= s; bitmap[i-1] |= over; } } return; } /* Return a printable string for the IPv4 address. */ static char * inet_ntoa4(struct in_addr ina) { static char buf[4][4 * sizeof "123" + 4]; unsigned char *ucp = (unsigned char *) &ina; static int i = 3; /* XXX-BZ returns static buffer. */ i = (i + 1) % 4; sprintf(buf[i], "%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff, ucp[2] & 0xff, ucp[3] & 0xff); return (buf[i]); } /* Return a printable string for the address. */ char * ipsec_address(union sockaddr_union* sa) { #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif switch (sa->sa.sa_family) { #ifdef INET case AF_INET: return inet_ntoa4(sa->sin.sin_addr); #endif /* INET */ #ifdef INET6 case AF_INET6: return ip6_sprintf(ip6buf, &sa->sin6.sin6_addr); #endif /* INET6 */ default: return "(unknown address family)"; } } const char * ipsec_logsastr(sav) struct secasvar *sav; { static char buf[256]; char *p; struct secasindex *saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->src.sa.sa_family == saidx->dst.sa.sa_family, ("address family mismatch")); p = buf; snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi)); while (p && *p) p++; /* NB: only use ipsec_address on one address at a time */ snprintf(p, sizeof (buf) - (p - buf), "src=%s ", ipsec_address(&saidx->src)); while (p && *p) p++; snprintf(p, sizeof (buf) - (p - buf), "dst=%s)", ipsec_address(&saidx->dst)); return buf; } void ipsec_dumpmbuf(m) struct mbuf *m; { int totlen; int i; u_char *p; totlen = 0; printf("---\n"); while (m) { p = mtod(m, u_char *); for (i = 0; i < m->m_len; i++) { printf("%02x ", p[i]); totlen++; if (totlen % 16 == 0) printf("\n"); } m = m->m_next; } if (totlen % 16 != 0) printf("\n"); printf("---\n"); } static void ipsec_attach(void) { SECPOLICY_LOCK_INIT(&V_ip4_def_policy); ip4_def_policy.refcnt = 1; /* NB: disallow free */ } SYSINIT(ipsec, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, ipsec_attach, NULL); /* XXX this stuff doesn't belong here... */ static struct xformsw* xforms = NULL; /* * Register a transform; typically at system startup. */ void xform_register(struct xformsw* xsp) { xsp->xf_next = xforms; xforms = xsp; } /* * Initialize transform support in an sav. 
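 * (An illustrative model of the replay-window logic above comes
 * first; xform_init itself follows it.)
 */

#ifdef notyet
/*
 * Standalone sketch, not part of this change: the sliding replay
 * window as maintained by ipsec_chkreplay()/ipsec_updatereplay()
 * and vshiftl() above.  bitmap[wsize-1] holds the newest sequence
 * numbers.  Simplified: the wsize == 0 (checking disabled) and
 * first-packet cases are omitted, and "replay_model_check" is a
 * name of our own invention.
 */
static int
replay_model_check(const u_char *bitmap, u_int wsize,
    u_int32_t lastseq, u_int32_t seq)
{
	u_int32_t diff, wsizeb = wsize << 3;	/* window size in bits */
	int fr, frlast = wsize - 1;

	if (seq == 0)
		return (0);		/* sequence number 0 is never valid */
	if (seq > lastseq)
		return (1);		/* newer than anything seen: accept */
	diff = lastseq - seq;
	if (diff >= wsizeb)
		return (0);		/* older than the window: reject */
	fr = frlast - diff / 8;
	/* accept only if this sequence number has not been seen before */
	return ((bitmap[fr] & (1 << (diff % 8))) == 0);
}
#endif

/*
 * xform_init: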
*/ int xform_init(struct secasvar *sav, int xftype) { struct xformsw *xsp; if (sav->tdb_xform != NULL) /* previously initialized */ return 0; for (xsp = xforms; xsp; xsp = xsp->xf_next) if (xsp->xf_type == xftype) return (*xsp->xf_init)(sav, xsp); return EINVAL; } Index: head/sys/netipsec/ipsec.h =================================================================== --- head/sys/netipsec/ipsec.h (revision 185087) +++ head/sys/netipsec/ipsec.h (revision 185088) @@ -1,441 +1,442 @@ /* $FreeBSD$ */ /* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #ifndef _NETIPSEC_IPSEC_H_ #define _NETIPSEC_IPSEC_H_ #if defined(_KERNEL) && !defined(_LKM) && !defined(KLD_MODULE) #include "opt_inet.h" #include "opt_ipsec.h" #endif #include #include #ifdef _KERNEL #define IPSEC_ASSERT(_c,_m) KASSERT(_c, _m) #define IPSEC_IS_PRIVILEGED_SO(_so) \ ((_so)->so_cred != NULL && \ priv_check_cred((_so)->so_cred, PRIV_NETINET_IPSEC, 0) \ == 0) /* * Security Policy Index * Ensure that both address families in the "src" and "dst" are same. * When the value of the ul_proto is ICMPv6, the port field in "src" * specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code. */ struct secpolicyindex { u_int8_t dir; /* direction of packet flow, see blow */ union sockaddr_union src; /* IP src address for SP */ union sockaddr_union dst; /* IP dst address for SP */ u_int8_t prefs; /* prefix length in bits for src */ u_int8_t prefd; /* prefix length in bits for dst */ u_int16_t ul_proto; /* upper layer Protocol */ #ifdef notyet uid_t uids; uid_t uidd; gid_t gids; gid_t gidd; #endif }; /* Security Policy Data Base */ struct secpolicy { LIST_ENTRY(secpolicy) chain; struct mtx lock; u_int refcnt; /* reference count */ struct secpolicyindex spidx; /* selector */ u_int32_t id; /* It's unique number on the system. 
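 * (Assigned from a private kernel counter when the policy is
 * installed; cf. key_getnewspid() in key.c.)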
*/ u_int state; /* 0: dead, others: alive */ #define IPSEC_SPSTATE_DEAD 0 #define IPSEC_SPSTATE_ALIVE 1 u_int16_t policy; /* policy_type per pfkeyv2.h */ u_int16_t scangen; /* scan generation # */ struct ipsecrequest *req; /* pointer to the ipsec request tree, */ /* if policy == IPSEC else this value == NULL.*/ /* * lifetime handler. * the policy can be used without limitiation if both lifetime and * validtime are zero. * "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime. * "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime. */ time_t created; /* time created the policy */ time_t lastused; /* updated every when kernel sends a packet */ long lifetime; /* duration of the lifetime of this policy */ long validtime; /* duration this policy is valid without use */ }; #define SECPOLICY_LOCK_INIT(_sp) \ mtx_init(&(_sp)->lock, "ipsec policy", NULL, MTX_DEF) #define SECPOLICY_LOCK(_sp) mtx_lock(&(_sp)->lock) #define SECPOLICY_UNLOCK(_sp) mtx_unlock(&(_sp)->lock) #define SECPOLICY_LOCK_DESTROY(_sp) mtx_destroy(&(_sp)->lock) #define SECPOLICY_LOCK_ASSERT(_sp) mtx_assert(&(_sp)->lock, MA_OWNED) /* Request for IPsec */ struct ipsecrequest { struct ipsecrequest *next; /* pointer to next structure */ /* If NULL, it means the end of chain. */ struct secasindex saidx;/* hint for search proper SA */ /* if __ss_len == 0 then no address specified.*/ u_int level; /* IPsec level defined below. */ struct secasvar *sav; /* place holder of SA for use */ struct secpolicy *sp; /* back pointer to SP */ struct mtx lock; /* to interlock updates */ }; /* * Need recursion for when crypto callbacks happen directly, * as in the case of software crypto. Need to look at how * hard it is to remove this... */ #define IPSECREQUEST_LOCK_INIT(_isr) \ mtx_init(&(_isr)->lock, "ipsec request", NULL, MTX_DEF | MTX_RECURSE) #define IPSECREQUEST_LOCK(_isr) mtx_lock(&(_isr)->lock) #define IPSECREQUEST_UNLOCK(_isr) mtx_unlock(&(_isr)->lock) #define IPSECREQUEST_LOCK_DESTROY(_isr) mtx_destroy(&(_isr)->lock) #define IPSECREQUEST_LOCK_ASSERT(_isr) mtx_assert(&(_isr)->lock, MA_OWNED) /* security policy in PCB */ struct inpcbpolicy { struct secpolicy *sp_in; struct secpolicy *sp_out; int priv; /* privileged socket ? */ }; /* SP acquiring list table. */ struct secspacq { LIST_ENTRY(secspacq) chain; struct secpolicyindex spidx; time_t created; /* for lifetime */ int count; /* for lifetime */ /* XXX: here is mbuf place holder to be sent ? */ }; #endif /* _KERNEL */ /* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */ #define IPSEC_PORT_ANY 0 #define IPSEC_ULPROTO_ANY 255 #define IPSEC_PROTO_ANY 255 /* mode of security protocol */ /* NOTE: DON'T use IPSEC_MODE_ANY at SPD. It's only use in SAD */ #define IPSEC_MODE_ANY 0 /* i.e. wildcard. */ #define IPSEC_MODE_TRANSPORT 1 #define IPSEC_MODE_TUNNEL 2 #define IPSEC_MODE_TCPMD5 3 /* TCP MD5 mode */ /* * Direction of security policy. * NOTE: Since INVALID is used just as flag. * The other are used for loop counter too. */ #define IPSEC_DIR_ANY 0 #define IPSEC_DIR_INBOUND 1 #define IPSEC_DIR_OUTBOUND 2 #define IPSEC_DIR_MAX 3 #define IPSEC_DIR_INVALID 4 /* Policy level */ /* * IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB, * DISCARD, IPSEC and NONE are allowed for setkey() in SPD. * DISCARD and NONE are allowed for system default. */ #define IPSEC_POLICY_DISCARD 0 /* discarding packet */ #define IPSEC_POLICY_NONE 1 /* through IPsec engine */ #define IPSEC_POLICY_IPSEC 2 /* do IPsec */ #define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. 
*/ #define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */ /* Security protocol level */ #define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */ #define IPSEC_LEVEL_USE 1 /* use SA if present. */ #define IPSEC_LEVEL_REQUIRE 2 /* require SA. */ #define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */ #define IPSEC_MANUAL_REQID_MAX 0x3fff /* * if security policy level == unique, this id * indicate to a relative SA for use, else is * zero. * 1 - 0x3fff are reserved for manual keying. * 0 are reserved for above reason. Others is * for kernel use. * Note that this id doesn't identify SA * by only itself. */ #define IPSEC_REPLAYWSIZE 32 /* statistics for ipsec processing */ struct ipsecstat { u_quad_t in_success; /* succeeded inbound process */ u_quad_t in_polvio; /* security policy violation for inbound process */ u_quad_t in_nosa; /* inbound SA is unavailable */ u_quad_t in_inval; /* inbound processing failed due to EINVAL */ u_quad_t in_nomem; /* inbound processing failed due to ENOBUFS */ u_quad_t in_badspi; /* failed getting a SPI */ u_quad_t in_ahreplay; /* AH replay check failed */ u_quad_t in_espreplay; /* ESP replay check failed */ u_quad_t in_ahauthsucc; /* AH authentication success */ u_quad_t in_ahauthfail; /* AH authentication failure */ u_quad_t in_espauthsucc; /* ESP authentication success */ u_quad_t in_espauthfail; /* ESP authentication failure */ u_quad_t in_esphist[256]; u_quad_t in_ahhist[256]; u_quad_t in_comphist[256]; u_quad_t out_success; /* succeeded outbound process */ u_quad_t out_polvio; /* security policy violation for outbound process */ u_quad_t out_nosa; /* outbound SA is unavailable */ u_quad_t out_inval; /* outbound process failed due to EINVAL */ u_quad_t out_nomem; /* inbound processing failed due to ENOBUFS */ u_quad_t out_noroute; /* there is no route */ u_quad_t out_esphist[256]; u_quad_t out_ahhist[256]; u_quad_t out_comphist[256]; u_quad_t spdcachelookup; u_quad_t spdcachemiss; u_int32_t ips_in_polvio; /* input: sec policy violation */ u_int32_t ips_out_polvio; /* output: sec policy violation */ u_int32_t ips_out_nosa; /* output: SA unavailable */ u_int32_t ips_out_nomem; /* output: no memory available */ u_int32_t ips_out_noroute; /* output: no route available */ u_int32_t ips_out_inval; /* output: generic error */ u_int32_t ips_out_bundlesa; /* output: bundled SA processed */ u_int32_t ips_mbcoalesced; /* mbufs coalesced during clone */ u_int32_t ips_clcoalesced; /* clusters coalesced during clone */ u_int32_t ips_clcopied; /* clusters copied during clone */ u_int32_t ips_mbinserted; /* mbufs inserted during makespace */ /* * Temporary statistics for performance analysis. */ /* See where ESP/AH/IPCOMP header land in mbuf on input */ u_int32_t ips_input_front; u_int32_t ips_input_middle; u_int32_t ips_input_end; }; /* * Definitions for IPsec & Key sysctl operations. 
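 * (A hypothetical userland consumer is sketched below.)
 */

#if 0
/*
 * Illustrative sketch only, not part of this header: reading the
 * system default policy level from userland with sysctlbyname(3).
 * The MIB name "net.inet.ipsec.def_policy" corresponds to
 * IPSECCTL_DEF_POLICY below; the helper name is our own invention.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

static int
get_ipsec_def_policy(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("net.inet.ipsec.def_policy", &val, &len,
	    NULL, 0) == -1)
		return (-1);	/* errno set by sysctlbyname(3) */
	return (val);
}
#endif

/*
 * (end of sketch)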
*/ /* * Names for IPsec & Key sysctl objects */ #define IPSECCTL_STATS 1 /* stats */ #define IPSECCTL_DEF_POLICY 2 #define IPSECCTL_DEF_ESP_TRANSLEV 3 /* int; ESP transport mode */ #define IPSECCTL_DEF_ESP_NETLEV 4 /* int; ESP tunnel mode */ #define IPSECCTL_DEF_AH_TRANSLEV 5 /* int; AH transport mode */ #define IPSECCTL_DEF_AH_NETLEV 6 /* int; AH tunnel mode */ #if 0 /* obsolete, do not reuse */ #define IPSECCTL_INBOUND_CALL_IKE 7 #endif #define IPSECCTL_AH_CLEARTOS 8 #define IPSECCTL_AH_OFFSETMASK 9 #define IPSECCTL_DFBIT 10 #define IPSECCTL_ECN 11 #define IPSECCTL_DEBUG 12 #define IPSECCTL_ESP_RANDPAD 13 #define IPSECCTL_MAXID 14 #define IPSECCTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { "def_policy", CTLTYPE_INT }, \ { "esp_trans_deflev", CTLTYPE_INT }, \ { "esp_net_deflev", CTLTYPE_INT }, \ { "ah_trans_deflev", CTLTYPE_INT }, \ { "ah_net_deflev", CTLTYPE_INT }, \ { 0, 0 }, \ { "ah_cleartos", CTLTYPE_INT }, \ { "ah_offsetmask", CTLTYPE_INT }, \ { "dfbit", CTLTYPE_INT }, \ { "ecn", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "esp_randpad", CTLTYPE_INT }, \ } #define IPSEC6CTL_NAMES { \ { 0, 0 }, \ { 0, 0 }, \ { "def_policy", CTLTYPE_INT }, \ { "esp_trans_deflev", CTLTYPE_INT }, \ { "esp_net_deflev", CTLTYPE_INT }, \ { "ah_trans_deflev", CTLTYPE_INT }, \ { "ah_net_deflev", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "ecn", CTLTYPE_INT }, \ { "debug", CTLTYPE_INT }, \ { "esp_randpad", CTLTYPE_INT }, \ } #ifdef _KERNEL struct ipsec_output_state { struct mbuf *m; struct route *ro; struct sockaddr *dst; }; struct ipsec_history { int ih_proto; u_int32_t ih_spi; }; extern int ipsec_debug; #ifdef REGRESSION extern int ipsec_replay; extern int ipsec_integrity; #endif extern struct ipsecstat ipsec4stat; extern struct secpolicy ip4_def_policy; extern int ip4_esp_trans_deflev; extern int ip4_esp_net_deflev; extern int ip4_ah_trans_deflev; extern int ip4_ah_net_deflev; extern int ip4_ah_cleartos; extern int ip4_ah_offsetmask; extern int ip4_ipsec_dfbit; extern int ip4_ipsec_ecn; extern int ip4_esp_randpad; extern int crypto_support; #define ipseclog(x) do { if (V_ipsec_debug) log x; } while (0) /* for openbsd compatibility */ #define DPRINTF(x) do { if (V_ipsec_debug) printf x; } while (0) extern struct ipsecrequest *ipsec_newisr(void); extern void ipsec_delisr(struct ipsecrequest *); struct tdb_ident; +extern void ipsec_init(void); extern struct secpolicy *ipsec_getpolicy __P((struct tdb_ident*, u_int)); struct inpcb; extern struct secpolicy *ipsec4_checkpolicy __P((struct mbuf *, u_int, u_int, int *, struct inpcb *)); extern struct secpolicy *ipsec_getpolicybysock(struct mbuf *, u_int, struct inpcb *, int *); extern struct secpolicy * ipsec_getpolicybyaddr(struct mbuf *, u_int, int, int *); struct inpcb; extern int ipsec_init_policy __P((struct socket *so, struct inpcbpolicy **)); extern int ipsec_copy_policy __P((struct inpcbpolicy *, struct inpcbpolicy *)); extern u_int ipsec_get_reqlevel __P((struct ipsecrequest *)); extern int ipsec_in_reject __P((struct secpolicy *, struct mbuf *)); extern int ipsec4_set_policy __P((struct inpcb *inp, int optname, caddr_t request, size_t len, struct ucred *cred)); extern int ipsec4_get_policy __P((struct inpcb *inpcb, caddr_t request, size_t len, struct mbuf **mp)); extern int ipsec4_delete_pcbpolicy __P((struct inpcb *)); extern int ipsec4_in_reject __P((struct mbuf *, struct inpcb *)); struct secas; struct tcpcb; extern int ipsec_chkreplay __P((u_int32_t, struct secasvar *)); extern int ipsec_updatereplay __P((u_int32_t, struct secasvar 
*)); extern size_t ipsec4_hdrsiz __P((struct mbuf *, u_int, struct inpcb *)); extern size_t ipsec_hdrsiz_tcp __P((struct tcpcb *)); union sockaddr_union; extern char * ipsec_address(union sockaddr_union* sa); extern const char *ipsec_logsastr __P((struct secasvar *)); extern void ipsec_dumpmbuf __P((struct mbuf *)); struct m_tag; extern void ah4_input(struct mbuf *m, int off); extern void ah4_ctlinput(int cmd, struct sockaddr *sa, void *); extern void esp4_input(struct mbuf *m, int off); extern void esp4_ctlinput(int cmd, struct sockaddr *sa, void *); extern void ipcomp4_input(struct mbuf *m, int off); extern int ipsec4_common_input(struct mbuf *m, ...); extern int ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff, struct m_tag *mt); extern int ipsec4_process_packet __P((struct mbuf *, struct ipsecrequest *, int, int)); extern int ipsec_process_done __P((struct mbuf *, struct ipsecrequest *)); extern struct mbuf *ipsec_copypkt __P((struct mbuf *)); extern void m_checkalignment(const char* where, struct mbuf *m0, int off, int len); extern struct mbuf *m_makespace(struct mbuf *m0, int skip, int hlen, int *off); extern caddr_t m_pad(struct mbuf *m, int n); extern int m_striphdr(struct mbuf *m, int skip, int hlen); #ifdef DEV_ENC #define ENC_BEFORE 0x0001 #define ENC_AFTER 0x0002 #define ENC_IN 0x0100 #define ENC_OUT 0x0200 extern int ipsec_filter(struct mbuf **, int, int); extern void ipsec_bpf(struct mbuf *, struct secasvar *, int, int); #endif #endif /* _KERNEL */ #ifndef _KERNEL extern caddr_t ipsec_set_policy __P((char *, int)); extern int ipsec_get_policylen __P((caddr_t)); extern char *ipsec_dump_policy __P((caddr_t, char *)); extern const char *ipsec_strerror __P((void)); #else #include #endif /* ! KERNEL */ #endif /* _NETIPSEC_IPSEC_H_ */ Index: head/sys/netipsec/key.c =================================================================== --- head/sys/netipsec/key.c (revision 185087) +++ head/sys/netipsec/key.c (revision 185088) @@ -1,7446 +1,7466 @@ /* $FreeBSD$ */ /* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This code is referd to RFC 2367 */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif /* INET6 */ #ifdef INET #include #endif #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include /* randomness */ #include #include #define FULLMASK 0xff #define _BITS(bytes) ((bytes) << 3) /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) * following value in reference count field. they cannot be freed and are * referenced from SA header. * - SAs that are in DEAD state will have (total external reference) * in reference count field. they are ready to be freed. reference from * SA header will be removed in key_delsav(), when the reference count * field hits 0 (= no external reference other than from SA header. */ -u_int32_t key_debug_level = 0; -static u_int key_spi_trycnt = 1000; -static u_int32_t key_spi_minval = 0x100; -static u_int32_t key_spi_maxval = 0x0fffffff; /* XXX */ -static u_int32_t policy_id = 0; -static u_int key_int_random = 60; /*interval to initialize randseed,1(m)*/ -static u_int key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ -static int key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ -static int key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ -static int key_preferred_oldsa = 1; /* preferred old sa rather than new sa.*/ +#ifdef VIMAGE_GLOBALS +u_int32_t key_debug_level; +static u_int key_spi_trycnt; +static u_int32_t key_spi_minval; +static u_int32_t key_spi_maxval; +static u_int32_t policy_id; +static u_int key_int_random; +static u_int key_larval_lifetime; +static int key_blockacq_count; +static int key_blockacq_lifetime; +static int key_preferred_oldsa; -static u_int32_t acq_seq = 0; +static u_int32_t acq_seq; +static int ipsec_esp_keymin; +static int ipsec_esp_auth; +static int ipsec_ah_keymin; + static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ +static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ +static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; +static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */ +static LIST_HEAD(_spacqtree, secspacq) spacqtree; /* SP acquiring list */ +#endif /* VIMAGE_GLOBALS */ + static struct mtx sptree_lock; #define SPTREE_LOCK_INIT() \ mtx_init(&sptree_lock, "sptree", \ "fast ipsec security policy database", MTX_DEF) #define SPTREE_LOCK_DESTROY() mtx_destroy(&sptree_lock) #define SPTREE_LOCK() mtx_lock(&sptree_lock) #define SPTREE_UNLOCK() mtx_unlock(&sptree_lock) #define SPTREE_LOCK_ASSERT() mtx_assert(&sptree_lock, MA_OWNED) -static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ static 
struct mtx sahtree_lock; #define SAHTREE_LOCK_INIT() \ mtx_init(&sahtree_lock, "sahtree", \ "fast ipsec security association database", MTX_DEF) #define SAHTREE_LOCK_DESTROY() mtx_destroy(&sahtree_lock) #define SAHTREE_LOCK() mtx_lock(&sahtree_lock) #define SAHTREE_UNLOCK() mtx_unlock(&sahtree_lock) #define SAHTREE_LOCK_ASSERT() mtx_assert(&sahtree_lock, MA_OWNED) /* registed list */ -static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; static struct mtx regtree_lock; #define REGTREE_LOCK_INIT() \ mtx_init(®tree_lock, "regtree", "fast ipsec regtree", MTX_DEF) #define REGTREE_LOCK_DESTROY() mtx_destroy(®tree_lock) #define REGTREE_LOCK() mtx_lock(®tree_lock) #define REGTREE_UNLOCK() mtx_unlock(®tree_lock) #define REGTREE_LOCK_ASSERT() mtx_assert(®tree_lock, MA_OWNED) -static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */ static struct mtx acq_lock; #define ACQ_LOCK_INIT() \ mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF) #define ACQ_LOCK_DESTROY() mtx_destroy(&acq_lock) #define ACQ_LOCK() mtx_lock(&acq_lock) #define ACQ_UNLOCK() mtx_unlock(&acq_lock) #define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED) -static LIST_HEAD(_spacqtree, secspacq) spacqtree; /* SP acquiring list */ static struct mtx spacq_lock; #define SPACQ_LOCK_INIT() \ mtx_init(&spacq_lock, "spacqtree", \ "fast ipsec security policy acquire list", MTX_DEF) #define SPACQ_LOCK_DESTROY() mtx_destroy(&spacq_lock) #define SPACQ_LOCK() mtx_lock(&spacq_lock) #define SPACQ_UNLOCK() mtx_unlock(&spacq_lock) #define SPACQ_LOCK_ASSERT() mtx_assert(&spacq_lock, MA_OWNED) /* search order for SAs */ static const u_int saorder_state_valid_prefer_old[] = { SADB_SASTATE_DYING, SADB_SASTATE_MATURE, }; static const u_int saorder_state_valid_prefer_new[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, }; static u_int saorder_state_alive[] = { /* except DEAD */ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL }; static u_int saorder_state_any[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD }; static const int minsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_SRC */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_DST */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_PROXY */ sizeof(struct sadb_key), /* SADB_EXT_KEY_AUTH */ sizeof(struct sadb_key), /* SADB_EXT_KEY_ENCRYPT */ sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_SRC */ sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_DST */ sizeof(struct sadb_sens), /* SADB_EXT_SENSITIVITY */ sizeof(struct sadb_prop), /* SADB_EXT_PROPOSAL */ sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_AUTH */ sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_ENCRYPT */ sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */ 0, /* SADB_X_EXT_KMPRIVATE */ sizeof(struct sadb_x_policy), /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ }; static const int maxsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ 0, /* SADB_EXT_ADDRESS_SRC */ 0, /* SADB_EXT_ADDRESS_DST */ 0, /* SADB_EXT_ADDRESS_PROXY */ 0, 
/* SADB_EXT_KEY_AUTH */ 0, /* SADB_EXT_KEY_ENCRYPT */ 0, /* SADB_EXT_IDENTITY_SRC */ 0, /* SADB_EXT_IDENTITY_DST */ 0, /* SADB_EXT_SENSITIVITY */ 0, /* SADB_EXT_PROPOSAL */ 0, /* SADB_EXT_SUPPORTED_AUTH */ 0, /* SADB_EXT_SUPPORTED_ENCRYPT */ sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */ 0, /* SADB_X_EXT_KMPRIVATE */ 0, /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ }; -static int ipsec_esp_keymin = 256; -static int ipsec_esp_auth = 0; -static int ipsec_ah_keymin = 128; - #ifdef SYSCTL_DECL SYSCTL_DECL(_net_key); #endif SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW, key_debug_level, 0, ""); /* max count of trial for the decision of spi value */ SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW, key_spi_trycnt, 0, ""); /* minimum spi value to allocate automatically. */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW, key_spi_minval, 0, ""); /* maximun spi value to allocate automatically. */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW, key_spi_maxval, 0, ""); /* interval to initialize randseed */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW, key_int_random, 0, ""); /* lifetime for larval SA */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW, key_larval_lifetime, 0, ""); /* counter for blocking to send SADB_ACQUIRE to IKEd */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW, key_blockacq_count, 0, ""); /* lifetime for blocking to send SADB_ACQUIRE to IKEd */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, key_blockacq_lifetime, 0, ""); /* ESP auth */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, ipsec_esp_auth, 0, ""); /* minimum ESP key length */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, ipsec_esp_keymin, 0, ""); /* minimum AH key length */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, ipsec_ah_keymin, 0, ""); /* perfered old SA rather than new SA */ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa, CTLFLAG_RW, key_preferred_oldsa, 0, ""); #define __LIST_CHAINED(elm) \ (!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL)) #define LIST_INSERT_TAIL(head, elm, type, field) \ do {\ struct type *curelm = LIST_FIRST(head); \ if (curelm == NULL) {\ LIST_INSERT_HEAD(head, elm, field); \ } else { \ while (LIST_NEXT(curelm, field)) \ curelm = LIST_NEXT(curelm, field);\ LIST_INSERT_AFTER(curelm, elm, field);\ }\ } while (0) #define KEY_CHKSASTATE(head, sav, name) \ do { \ if ((head) != (sav)) { \ ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \ (name), (head), (sav))); \ continue; \ } \ } while (0) #define KEY_CHKSPDIR(head, sp, name) \ do { \ if ((head) != (sp)) { \ ipseclog((LOG_DEBUG, "%s: direction mismatched (TREE=%d SP=%d), " \ "anyway continue.\n", \ (name), (head), (sp))); \ } \ } while (0) MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association"); MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head"); MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy"); MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request"); MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous"); MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire"); 
MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire"); /* * set parameters into secpolicyindex buffer. * Must allocate secpolicyindex buffer passed to this function. */ #define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \ do { \ bzero((idx), sizeof(struct secpolicyindex)); \ (idx)->dir = (_dir); \ (idx)->prefs = (ps); \ (idx)->prefd = (pd); \ (idx)->ul_proto = (ulp); \ bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \ bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \ } while (0) /* * set parameters into secasindex buffer. * Must allocate secasindex buffer before calling this function. */ #define KEY_SETSECASIDX(p, m, r, s, d, idx) \ do { \ bzero((idx), sizeof(struct secasindex)); \ (idx)->proto = (p); \ (idx)->mode = (m); \ (idx)->reqid = (r); \ bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \ bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \ } while (0) /* key statistics */ struct _keystat { u_long getspi_count; /* the avarage of count to try to get new SPI */ } keystat; struct sadb_msghdr { struct sadb_msg *msg; struct sadb_ext *ext[SADB_EXT_MAX + 1]; int extoff[SADB_EXT_MAX + 1]; int extlen[SADB_EXT_MAX + 1]; }; static struct secasvar *key_allocsa_policy __P((const struct secasindex *)); static void key_freesp_so __P((struct secpolicy **)); static struct secasvar *key_do_allocsa_policy __P((struct secashead *, u_int)); static void key_delsp __P((struct secpolicy *)); static struct secpolicy *key_getsp __P((struct secpolicyindex *)); static void _key_delsp(struct secpolicy *sp); static struct secpolicy *key_getspbyid __P((u_int32_t)); static u_int32_t key_newreqid __P((void)); static struct mbuf *key_gather_mbuf __P((struct mbuf *, const struct sadb_msghdr *, int, int, ...)); static int key_spdadd __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static u_int32_t key_getnewspid __P((void)); static int key_spddelete __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spddelete2 __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spdget __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spdflush __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spddump __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static struct mbuf *key_setdumpsp __P((struct secpolicy *, u_int8_t, u_int32_t, u_int32_t)); static u_int key_getspreqmsglen __P((struct secpolicy *)); static int key_spdexpire __P((struct secpolicy *)); static struct secashead *key_newsah __P((struct secasindex *)); static void key_delsah __P((struct secashead *)); static struct secasvar *key_newsav __P((struct mbuf *, const struct sadb_msghdr *, struct secashead *, int *, const char*, int)); #define KEY_NEWSAV(m, sadb, sah, e) \ key_newsav(m, sadb, sah, e, __FILE__, __LINE__) static void key_delsav __P((struct secasvar *)); static struct secashead *key_getsah __P((struct secasindex *)); static struct secasvar *key_checkspidup __P((struct secasindex *, u_int32_t)); static struct secasvar *key_getsavbyspi __P((struct secashead *, u_int32_t)); static int key_setsaval __P((struct secasvar *, struct mbuf *, const struct sadb_msghdr *)); static int key_mature __P((struct secasvar *)); static struct mbuf *key_setdumpsa __P((struct secasvar *, u_int8_t, u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbmsg __P((u_int8_t, u_int16_t, u_int8_t, u_int32_t, pid_t, u_int16_t)); static struct mbuf 
*key_setsadbsa __P((struct secasvar *)); static struct mbuf *key_setsadbaddr __P((u_int16_t, const struct sockaddr *, u_int8_t, u_int16_t)); static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t, u_int32_t)); static struct seckey *key_dup_keymsg(const struct sadb_key *, u_int, struct malloc_type *); static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type); #ifdef INET6 static int key_ismyaddr6 __P((struct sockaddr_in6 *)); #endif /* flags for key_cmpsaidx() */ #define CMP_HEAD 1 /* protocol, addresses. */ #define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */ #define CMP_REQID 3 /* additionally HEAD, reaid. */ #define CMP_EXACTLY 4 /* all elements. */ static int key_cmpsaidx __P((const struct secasindex *, const struct secasindex *, int)); static int key_cmpspidx_exactly __P((struct secpolicyindex *, struct secpolicyindex *)); static int key_cmpspidx_withmask __P((struct secpolicyindex *, struct secpolicyindex *)); static int key_sockaddrcmp __P((const struct sockaddr *, const struct sockaddr *, int)); static int key_bbcmp __P((const void *, const void *, u_int)); static u_int16_t key_satype2proto __P((u_int8_t)); static u_int8_t key_proto2satype __P((u_int16_t)); static int key_getspi __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static u_int32_t key_do_getnewspi __P((struct sadb_spirange *, struct secasindex *)); static int key_update __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); #ifdef IPSEC_DOSEQCHECK static struct secasvar *key_getsavbyseq __P((struct secashead *, u_int32_t)); #endif static int key_add __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_setident __P((struct secashead *, struct mbuf *, const struct sadb_msghdr *)); static struct mbuf *key_getmsgbuf_x1 __P((struct mbuf *, const struct sadb_msghdr *)); static int key_delete __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_get __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static void key_getcomb_setlifetime __P((struct sadb_comb *)); static struct mbuf *key_getcomb_esp __P((void)); static struct mbuf *key_getcomb_ah __P((void)); static struct mbuf *key_getcomb_ipcomp __P((void)); static struct mbuf *key_getprop __P((const struct secasindex *)); static int key_acquire __P((const struct secasindex *, struct secpolicy *)); static struct secacq *key_newacq __P((const struct secasindex *)); static struct secacq *key_getacq __P((const struct secasindex *)); static struct secacq *key_getacqbyseq __P((u_int32_t)); static struct secspacq *key_newspacq __P((struct secpolicyindex *)); static struct secspacq *key_getspacq __P((struct secpolicyindex *)); static int key_acquire2 __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_register __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_expire __P((struct secasvar *)); static int key_flush __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_dump __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_promisc __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_senderror __P((struct socket *, struct mbuf *, int)); static int key_validate_ext __P((const struct sadb_ext *, int)); static int key_align __P((struct mbuf *, struct sadb_msghdr *)); static struct mbuf 
*key_setlifetime(struct seclifetime *src, u_int16_t exttype); static struct mbuf *key_setkey(struct seckey *src, u_int16_t exttype); #if 0 static const char *key_getfqdn __P((void)); static const char *key_getuserfqdn __P((void)); #endif static void key_sa_chgstate __P((struct secasvar *, u_int8_t)); static struct mbuf *key_alloc_mbuf __P((int)); static __inline void sa_initref(struct secasvar *sav) { refcount_init(&sav->refcnt, 1); } static __inline void sa_addref(struct secasvar *sav) { refcount_acquire(&sav->refcnt); IPSEC_ASSERT(sav->refcnt != 0, ("SA refcnt overflow")); } static __inline int sa_delref(struct secasvar *sav) { IPSEC_ASSERT(sav->refcnt > 0, ("SA refcnt underflow")); return (refcount_release(&sav->refcnt)); } #define SP_ADDREF(p) do { \ (p)->refcnt++; \ IPSEC_ASSERT((p)->refcnt != 0, ("SP refcnt overflow")); \ } while (0) #define SP_DELREF(p) do { \ IPSEC_ASSERT((p)->refcnt > 0, ("SP refcnt underflow")); \ (p)->refcnt--; \ } while (0) /* * Update the refcnt while holding the SPTREE lock. */ void key_addref(struct secpolicy *sp) { SPTREE_LOCK(); SP_ADDREF(sp); SPTREE_UNLOCK(); } /* * Return 0 when there are known to be no SP's for the specified * direction. Otherwise return 1. This is used by IPsec code * to optimize performance. */ int key_havesp(u_int dir) { INIT_VNET_IPSEC(curvnet); return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ? LIST_FIRST(&V_sptree[dir]) != NULL : 1); } /* %%% IPsec policy management */ /* * allocating a SP for OUTBOUND or INBOUND packet. * Must call key_freesp() later. * OUT: NULL: not found * others: found and return the pointer. */ struct secpolicy * key_allocsp(struct secpolicyindex *spidx, u_int dir, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* get a SP entry */ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); kdebug_secpolicyindex(spidx)); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_withmask(&sp->spidx, spidx)) goto found; } sp = NULL; found: if (sp) { /* sanity check */ KEY_CHKSPDIR(sp->spidx.dir, dir, __func__); /* found a SPD entry */ sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } /* * allocating a SP for OUTBOUND or INBOUND packet. * Must call key_freesp() later. * OUT: NULL: not found * others: found and return the pointer. 
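 *
 * The same allocate/free contract as key_allocsp() applies; a minimal
 * caller sketch (variables hypothetical; KEY_ALLOCSP/KEY_FREESP are
 * assumed to be the usual wrappers that pass __FILE__/__LINE__ for the
 * KEYDEBUG stamps):
 *
 *	struct secpolicy *sp;
 *
 *	sp = KEY_ALLOCSP(&spidx, IPSEC_DIR_OUTBOUND);
 *	if (sp != NULL) {
 *		... apply the policy to the packet ...
 *		KEY_FREESP(&sp);
 *	}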
*/ struct secpolicy * key_allocsp2(u_int32_t spi, union sockaddr_union *dst, u_int8_t proto, u_int dir, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(dst != NULL, ("null dst")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* get a SP entry */ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); printf("spi %u proto %u dir %u\n", spi, proto, dir); kdebug_sockaddr(&dst->sa)); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); if (sp->state == IPSEC_SPSTATE_DEAD) continue; /* compare simple values, then dst address */ if (sp->spidx.ul_proto != proto) continue; /* NB: spi's must exist and match */ if (!sp->req || !sp->req->sav || sp->req->sav->spi != spi) continue; if (key_sockaddrcmp(&sp->spidx.dst.sa, &dst->sa, 1) == 0) goto found; } sp = NULL; found: if (sp) { /* sanity check */ KEY_CHKSPDIR(sp->spidx.dir, dir, __func__); /* found a SPD entry */ sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } /* * return a policy that matches this particular inbound packet. * XXX slow */ struct secpolicy * key_gettunnel(const struct sockaddr *osrc, const struct sockaddr *odst, const struct sockaddr *isrc, const struct sockaddr *idst, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; const int dir = IPSEC_DIR_INBOUND; struct ipsecrequest *r1, *r2, *p; struct secpolicyindex spidx; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); if (isrc->sa_family != idst->sa_family) { ipseclog((LOG_ERR, "%s: protocol family mismatched %d != %d\n.", __func__, isrc->sa_family, idst->sa_family)); sp = NULL; goto done; } SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; r1 = r2 = NULL; for (p = sp->req; p; p = p->next) { if (p->saidx.mode != IPSEC_MODE_TUNNEL) continue; r1 = r2; r2 = p; if (!r1) { /* here we look at address matches only */ spidx = sp->spidx; if (isrc->sa_len > sizeof(spidx.src) || idst->sa_len > sizeof(spidx.dst)) continue; bcopy(isrc, &spidx.src, isrc->sa_len); bcopy(idst, &spidx.dst, idst->sa_len); if (!key_cmpspidx_withmask(&sp->spidx, &spidx)) continue; } else { if (key_sockaddrcmp(&r1->saidx.src.sa, isrc, 0) || key_sockaddrcmp(&r1->saidx.dst.sa, idst, 0)) continue; } if (key_sockaddrcmp(&r2->saidx.src.sa, osrc, 0) || key_sockaddrcmp(&r2->saidx.dst.sa, odst, 0)) continue; goto found; } } sp = NULL; found: if (sp) { sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); done: KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } /* * allocating an SA entry for an *OUTBOUND* packet. * checking each request entries in SP, and acquire an SA if need. * OUT: 0: there are valid requests. * ENOENT: policy may be valid, but SA with REQUIRE is on acquiring. 
*/ int key_checkrequest(struct ipsecrequest *isr, const struct secasindex *saidx) { INIT_VNET_IPSEC(curvnet); u_int level; int error; IPSEC_ASSERT(isr != NULL, ("null isr")); IPSEC_ASSERT(saidx != NULL, ("null saidx")); IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT || saidx->mode == IPSEC_MODE_TUNNEL, ("unexpected policy %u", saidx->mode)); /* * XXX guard against protocol callbacks from the crypto * thread as they reference ipsecrequest.sav which we * temporarily null out below. Need to rethink how we * handle bundled SA's in the callback thread. */ IPSECREQUEST_LOCK_ASSERT(isr); /* get current level */ level = ipsec_get_reqlevel(isr); #if 0 /* * We do allocate new SA only if the state of SA in the holder is * SADB_SASTATE_DEAD. The SA for outbound must be the oldest. */ if (isr->sav != NULL) { if (isr->sav->sah == NULL) panic("%s: sah is null.\n", __func__); if (isr->sav == (struct secasvar *)LIST_FIRST( &isr->sav->sah->savtree[SADB_SASTATE_DEAD])) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } } #else /* * we free any SA stashed in the IPsec request because a different * SA may be involved each time this request is checked, either * because new SAs are being configured, or this request is * associated with an unconnected datagram socket, or this request * is associated with a system default policy. * * The operation may have negative impact to performance. We may * want to check cached SA carefully, rather than picking new SA * every time. */ if (isr->sav != NULL) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } #endif /* * new SA allocation if no SA found. * key_allocsa_policy should allocate the oldest SA available. * See key_do_allocsa_policy(), and draft-jenkins-ipsec-rekeying-03.txt. */ if (isr->sav == NULL) isr->sav = key_allocsa_policy(saidx); /* When there is SA. */ if (isr->sav != NULL) { if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) return EINVAL; return 0; } /* there is no SA */ error = key_acquire(saidx, isr->sp); if (error != 0) { /* XXX What should I do ? */ ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n", __func__, error)); return error; } if (level != IPSEC_LEVEL_REQUIRE) { /* XXX sigh, the interface to this routine is botched */ IPSEC_ASSERT(isr->sav == NULL, ("unexpected SA")); return 0; } else { return ENOENT; } } /* * allocating a SA for policy entry from SAD. * NOTE: searching SAD of aliving state. * OUT: NULL: not found. * others: found and return the pointer. */ static struct secasvar * key_allocsa_policy(const struct secasindex *saidx) { #define N(a) _ARRAYLEN(a) INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int stateidx, arraysize; const u_int *state_valid; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) { if (V_key_preferred_oldsa) { state_valid = saorder_state_valid_prefer_old; arraysize = N(saorder_state_valid_prefer_old); } else { state_valid = saorder_state_valid_prefer_new; arraysize = N(saorder_state_valid_prefer_new); } SAHTREE_UNLOCK(); goto found; } } SAHTREE_UNLOCK(); return NULL; found: /* search valid state */ for (stateidx = 0; stateidx < arraysize; stateidx++) { sav = key_do_allocsa_policy(sah, state_valid[stateidx]); if (sav != NULL) return sav; } return NULL; #undef N } /* * searching SAD with direction, protocol, mode and state. * called by key_allocsa_policy(). * OUT: * NULL : not found * others : found, pointer to a SA. 
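 *
 * Candidate selection is driven by the net.key.preferred_oldsa sysctl;
 * an illustrative timeline with two MATURE SAs (addtimes hypothetical):
 *
 *	SA1: lft_c->addtime = 100	SA2: lft_c->addtime = 200
 *	V_key_preferred_oldsa != 0  ->  SA1 is returned (oldest kept)
 *	V_key_preferred_oldsa == 0  ->  SA2 is returned; SA1 is marked
 *	    DEAD and a SADB_DELETE is pushed to registered listeners,
 *	    unless its hard addtime is 0 (a permanent SA).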
*/ static struct secasvar * key_do_allocsa_policy(struct secashead *sah, u_int state) { INIT_VNET_IPSEC(curvnet); struct secasvar *sav, *nextsav, *candidate, *d; /* initialize */ candidate = NULL; SAHTREE_LOCK(); for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); /* sanity check */ KEY_CHKSASTATE(sav->state, state, __func__); /* initialize */ if (candidate == NULL) { candidate = sav; continue; } /* Which SA is the better? */ IPSEC_ASSERT(candidate->lft_c != NULL, ("null candidate lifetime")); IPSEC_ASSERT(sav->lft_c != NULL, ("null sav lifetime")); /* What is the best method to compare? */ if (V_key_preferred_oldsa) { if (candidate->lft_c->addtime > sav->lft_c->addtime) { candidate = sav; } continue; /*NOTREACHED*/ } /* prefer the new SA rather than the old one */ if (candidate->lft_c->addtime < sav->lft_c->addtime) { d = candidate; candidate = sav; } else d = sav; /* * prepare to delete the SA when there is a more * suitable candidate and the lifetime of the SA is not * permanent. */ if (d->lft_h->addtime != 0) { struct mbuf *m, *result; u_int8_t satype; key_sa_chgstate(d, SADB_SASTATE_DEAD); IPSEC_ASSERT(d->refcnt > 0, ("bogus ref count")); satype = key_proto2satype(d->sah->saidx.proto); if (satype == 0) goto msgfail; m = key_setsadbmsg(SADB_DELETE, 0, satype, 0, 0, d->refcnt - 1); if (!m) goto msgfail; result = m; /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &d->sah->saidx.src.sa, d->sah->saidx.src.sa.sa_len << 3, IPSEC_ULPROTO_ANY); if (!m) goto msgfail; m_cat(result, m); /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &d->sah->saidx.dst.sa, d->sah->saidx.dst.sa.sa_len << 3, IPSEC_ULPROTO_ANY); if (!m) goto msgfail; m_cat(result, m); /* create SA extension */ m = key_setsadbsa(d); if (!m) goto msgfail; m_cat(result, m); if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto msgfail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); if (key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED)) goto msgfail; msgfail: KEY_FREESAV(&d); } } if (candidate) { sa_addref(candidate); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s cause refcnt++:%d SA:%p\n", __func__, candidate->refcnt, candidate)); } SAHTREE_UNLOCK(); return candidate; } /* * allocating a usable SA entry for an *INBOUND* packet. * Must call key_freesav() later. * OUT: positive: pointer to a usable sav (i.e. MATURE or DYING state). * NULL: not found, or an error occurred. * * In the comparison, no source address is used--for RFC2401 conformance. * To quote, from section 4.1: * A security association is uniquely identified by a triple consisting * of a Security Parameter Index (SPI), an IP Destination Address, and a * security protocol (AH or ESP) identifier. * Note that, however, we do need to keep source address in IPsec SA. * IKE specification and PF_KEY specification do assume that we * keep source address in IPsec SA. We see a tricky situation here.
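 *
 * The lookup is therefore keyed on (SPI, destination, protocol) only.
 * A minimal inbound-path sketch (variables hypothetical; KEY_ALLOCSA
 * is assumed to be the __FILE__/__LINE__ wrapper):
 *
 *	union sockaddr_union dst;	(filled from the outer IP header)
 *	struct secasvar *sav;
 *
 *	sav = KEY_ALLOCSA(&dst, IPPROTO_ESP, spi);
 *	if (sav == NULL)
 *		return;			(no usable SA: drop the packet)
 *	... verify and decrypt with sav ...
 *	KEY_FREESAV(&sav);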
*/ struct secasvar * key_allocsa( union sockaddr_union *dst, u_int proto, u_int32_t spi, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int stateidx, arraysize, state; const u_int *saorder_state_valid; IPSEC_ASSERT(dst != NULL, ("null dst address")); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* * searching SAD. * XXX: to be checked internal IP header somewhere. Also when * IPsec tunnel packet is received. But ESP tunnel mode is * encrypted so we can't check internal IP header. */ SAHTREE_LOCK(); if (V_key_preferred_oldsa) { saorder_state_valid = saorder_state_valid_prefer_old; arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); } else { saorder_state_valid = saorder_state_valid_prefer_new; arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); } LIST_FOREACH(sah, &V_sahtree, chain) { /* search valid state */ for (stateidx = 0; stateidx < arraysize; stateidx++) { state = saorder_state_valid[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ KEY_CHKSASTATE(sav->state, state, __func__); /* do not return entries w/ unusable state */ if (sav->state != SADB_SASTATE_MATURE && sav->state != SADB_SASTATE_DYING) continue; if (proto != sav->sah->saidx.proto) continue; if (spi != sav->spi) continue; #if 0 /* don't check src */ /* check src address */ if (key_sockaddrcmp(&src->sa, &sav->sah->saidx.src.sa, 0) != 0) continue; #endif /* check dst address */ if (key_sockaddrcmp(&dst->sa, &sav->sah->saidx.dst.sa, 0) != 0) continue; sa_addref(sav); goto done; } } } sav = NULL; done: SAHTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SA:%p; refcnt %u\n", __func__, sav, sav ? sav->refcnt : 0)); return sav; } /* * Must be called after calling key_allocsp(). * For both the packet without socket and key_freeso(). */ void _key_freesp(struct secpolicy **spp, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = *spp; IPSEC_ASSERT(sp != NULL, ("null sp")); SPTREE_LOCK(); SP_DELREF(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SP:%p (ID=%u) from %s:%u; refcnt now %u\n", __func__, sp, sp->id, where, tag, sp->refcnt)); if (sp->refcnt == 0) { *spp = NULL; key_delsp(sp); } SPTREE_UNLOCK(); } /* * Must be called after calling key_allocsp(). * For the packet with socket. */ void key_freeso(struct socket *so) { INIT_VNET_IPSEC(curvnet); IPSEC_ASSERT(so != NULL, ("null so")); switch (so->so_proto->pr_domain->dom_family) { #ifdef INET case PF_INET: { struct inpcb *pcb = sotoinpcb(so); /* Does it have a PCB ? */ if (pcb == NULL) return; key_freesp_so(&pcb->inp_sp->sp_in); key_freesp_so(&pcb->inp_sp->sp_out); } break; #endif #ifdef INET6 case PF_INET6: { #ifdef HAVE_NRL_INPCB struct inpcb *pcb = sotoinpcb(so); /* Does it have a PCB ? */ if (pcb == NULL) return; key_freesp_so(&pcb->inp_sp->sp_in); key_freesp_so(&pcb->inp_sp->sp_out); #else struct in6pcb *pcb = sotoin6pcb(so); /* Does it have a PCB ? 
*/ if (pcb == NULL) return; key_freesp_so(&pcb->in6p_sp->sp_in); key_freesp_so(&pcb->in6p_sp->sp_out); #endif } break; #endif /* INET6 */ default: ipseclog((LOG_DEBUG, "%s: unknown address family=%d.\n", __func__, so->so_proto->pr_domain->dom_family)); return; } } static void key_freesp_so(struct secpolicy **sp) { IPSEC_ASSERT(sp != NULL && *sp != NULL, ("null sp")); if ((*sp)->policy == IPSEC_POLICY_ENTRUST || (*sp)->policy == IPSEC_POLICY_BYPASS) return; IPSEC_ASSERT((*sp)->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", (*sp)->policy)); KEY_FREESP(sp); } /* * Must be called after calling key_allocsa(). * This function is called by key_freesp() to free some SA allocated * for a policy. */ void key_freesav(struct secasvar **psav, const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secasvar *sav = *psav; IPSEC_ASSERT(sav != NULL, ("null sav")); if (sa_delref(sav)) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n", __func__, sav, ntohl(sav->spi), where, tag, sav->refcnt)); *psav = NULL; key_delsav(sav); } else { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n", __func__, sav, ntohl(sav->spi), where, tag, sav->refcnt)); } } /* %%% SPD management */ /* * free security policy entry. */ static void key_delsp(struct secpolicy *sp) { struct ipsecrequest *isr, *nextisr; IPSEC_ASSERT(sp != NULL, ("null sp")); SPTREE_LOCK_ASSERT(); sp->state = IPSEC_SPSTATE_DEAD; IPSEC_ASSERT(sp->refcnt == 0, ("SP with references deleted (refcnt %u)", sp->refcnt)); /* remove from SP index */ if (__LIST_CHAINED(sp)) LIST_REMOVE(sp, chain); for (isr = sp->req; isr != NULL; isr = nextisr) { if (isr->sav != NULL) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } nextisr = isr->next; ipsec_delisr(isr); } _key_delsp(sp); } /* * search SPD * OUT: NULL : not found * others : found, pointer to a SP. */ static struct secpolicy * key_getsp(struct secpolicyindex *spidx) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[spidx->dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_exactly(spidx, &sp->spidx)) { SP_ADDREF(sp); break; } } SPTREE_UNLOCK(); return sp; } /* * get SP by index. * OUT: NULL : not found * others : found, pointer to a SP. */ static struct secpolicy * key_getspbyid(u_int32_t id) { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_INBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { SP_ADDREF(sp); goto done; } } LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_OUTBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { SP_ADDREF(sp); goto done; } } done: SPTREE_UNLOCK(); return sp; } struct secpolicy * key_newsp(const char* where, int tag) { INIT_VNET_IPSEC(curvnet); struct secpolicy *newsp = NULL; newsp = (struct secpolicy *) malloc(sizeof(struct secpolicy), M_IPSEC_SP, M_NOWAIT|M_ZERO); if (newsp) { SECPOLICY_LOCK_INIT(newsp); newsp->refcnt = 1; newsp->req = NULL; } KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u return SP:%p\n", __func__, where, tag, newsp)); return newsp; } static void _key_delsp(struct secpolicy *sp) { SECPOLICY_LOCK_DESTROY(sp); free(sp, M_IPSEC_SP); } /* * create secpolicy structure from sadb_x_policy structure. * NOTE: `state', `secpolicyindex' in secpolicy structure are not set, * so must be set properly later. 
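 *
 * The extension parsed here is laid out on the wire as (lengths are
 * 64-bit aligned; the request list is optional and may repeat):
 *
 *	struct sadb_x_policy		type, dir, id
 *	struct sadb_x_ipsecrequest	proto, mode, level, reqid
 *	    [src sockaddr][dst sockaddr]	optional tunnel endpoints
 *	struct sadb_x_ipsecrequest	... until the extension length
 *					    is exhausted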
*/ struct secpolicy * key_msg2sp(xpl0, len, error) struct sadb_x_policy *xpl0; size_t len; int *error; { INIT_VNET_IPSEC(curvnet); struct secpolicy *newsp; IPSEC_ASSERT(xpl0 != NULL, ("null xpl0")); IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len)); if (len != PFKEY_EXTLEN(xpl0)) { ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__)); *error = EINVAL; return NULL; } if ((newsp = KEY_NEWSP()) == NULL) { *error = ENOBUFS; return NULL; } newsp->spidx.dir = xpl0->sadb_x_policy_dir; newsp->policy = xpl0->sadb_x_policy_type; /* check policy */ switch (xpl0->sadb_x_policy_type) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_NONE: case IPSEC_POLICY_ENTRUST: case IPSEC_POLICY_BYPASS: newsp->req = NULL; break; case IPSEC_POLICY_IPSEC: { int tlen; struct sadb_x_ipsecrequest *xisr; struct ipsecrequest **p_isr = &newsp->req; /* validity check */ if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) { ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0); xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1); while (tlen > 0) { /* length check */ if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr)) { ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest " "length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } /* allocate request buffer */ /* NB: data structure is zero'd */ *p_isr = ipsec_newisr(); if ((*p_isr) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); KEY_FREESP(&newsp); *error = ENOBUFS; return NULL; } /* set values */ switch (xisr->sadb_x_ipsecrequest_proto) { case IPPROTO_ESP: case IPPROTO_AH: case IPPROTO_IPCOMP: break; default: ipseclog((LOG_DEBUG, "%s: invalid proto type=%u\n", __func__, xisr->sadb_x_ipsecrequest_proto)); KEY_FREESP(&newsp); *error = EPROTONOSUPPORT; return NULL; } (*p_isr)->saidx.proto = xisr->sadb_x_ipsecrequest_proto; switch (xisr->sadb_x_ipsecrequest_mode) { case IPSEC_MODE_TRANSPORT: case IPSEC_MODE_TUNNEL: break; case IPSEC_MODE_ANY: default: ipseclog((LOG_DEBUG, "%s: invalid mode=%u\n", __func__, xisr->sadb_x_ipsecrequest_mode)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } (*p_isr)->saidx.mode = xisr->sadb_x_ipsecrequest_mode; switch (xisr->sadb_x_ipsecrequest_level) { case IPSEC_LEVEL_DEFAULT: case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: break; case IPSEC_LEVEL_UNIQUE: /* validity check */ /* * If range violation of reqid, kernel will * update it, don't refuse it. */ if (xisr->sadb_x_ipsecrequest_reqid > IPSEC_MANUAL_REQID_MAX) { ipseclog((LOG_DEBUG, "%s: reqid=%d range " "violation, updated by kernel.\n", __func__, xisr->sadb_x_ipsecrequest_reqid)); xisr->sadb_x_ipsecrequest_reqid = 0; } /* allocate new reqid id if reqid is zero. */ if (xisr->sadb_x_ipsecrequest_reqid == 0) { u_int32_t reqid; if ((reqid = key_newreqid()) == 0) { KEY_FREESP(&newsp); *error = ENOBUFS; return NULL; } (*p_isr)->saidx.reqid = reqid; xisr->sadb_x_ipsecrequest_reqid = reqid; } else { /* set it for manual keying. 
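 * Reqids at or below IPSEC_MANUAL_REQID_MAX are reserved for manual
 * keying; that is why the key_newreqid() call above starts handing out
 * kernel-generated ids at IPSEC_MANUAL_REQID_MAX + 1.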
*/ (*p_isr)->saidx.reqid = xisr->sadb_x_ipsecrequest_reqid; } break; default: ipseclog((LOG_DEBUG, "%s: invalid level=%u\n", __func__, xisr->sadb_x_ipsecrequest_level)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } (*p_isr)->level = xisr->sadb_x_ipsecrequest_level; /* set IP addresses if present */ if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) { struct sockaddr *paddr; paddr = (struct sockaddr *)(xisr + 1); /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.src)) { ipseclog((LOG_DEBUG, "%s: invalid " "request address length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } bcopy(paddr, &(*p_isr)->saidx.src, paddr->sa_len); paddr = (struct sockaddr *)((caddr_t)paddr + paddr->sa_len); /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.dst)) { ipseclog((LOG_DEBUG, "%s: invalid " "request address length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } bcopy(paddr, &(*p_isr)->saidx.dst, paddr->sa_len); } (*p_isr)->sp = newsp; /* initialization for the next. */ p_isr = &(*p_isr)->next; tlen -= xisr->sadb_x_ipsecrequest_len; /* validity check */ if (tlen < 0) { ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr + xisr->sadb_x_ipsecrequest_len); } } break; default: ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } *error = 0; return newsp; } static u_int32_t key_newreqid() { static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; auto_reqid = (auto_reqid == ~0 ? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1); /* XXX should be unique check */ return auto_reqid; } /* * copy secpolicy struct into the indicated sadb_x_policy structure. */ struct mbuf * key_sp2msg(sp) struct secpolicy *sp; { struct sadb_x_policy *xpl; int tlen; caddr_t p; struct mbuf *m; IPSEC_ASSERT(sp != NULL, ("null policy")); tlen = key_getspreqmsglen(sp); m = key_alloc_mbuf(tlen); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } m->m_len = tlen; m->m_next = NULL; xpl = mtod(m, struct sadb_x_policy *); bzero(xpl, tlen); xpl->sadb_x_policy_len = PFKEY_UNIT64(tlen); xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY; xpl->sadb_x_policy_type = sp->policy; xpl->sadb_x_policy_dir = sp->spidx.dir; xpl->sadb_x_policy_id = sp->id; p = (caddr_t)xpl + sizeof(*xpl); /* is this the policy for IPsec? */ if (sp->policy == IPSEC_POLICY_IPSEC) { struct sadb_x_ipsecrequest *xisr; struct ipsecrequest *isr; for (isr = sp->req; isr != NULL; isr = isr->next) { xisr = (struct sadb_x_ipsecrequest *)p; xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto; xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode; xisr->sadb_x_ipsecrequest_level = isr->level; xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid; p += sizeof(*xisr); bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len); p += isr->saidx.src.sa.sa_len; bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len); p += isr->saidx.dst.sa.sa_len; xisr->sadb_x_ipsecrequest_len = PFKEY_ALIGN8(sizeof(*xisr) + isr->saidx.src.sa.sa_len + isr->saidx.dst.sa.sa_len); } } return m; } /* m will not be freed nor modified */ static struct mbuf * #ifdef __STDC__ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, int ndeep, int nitem, ...)
#else key_gather_mbuf(m, mhp, ndeep, nitem, va_alist) struct mbuf *m; const struct sadb_msghdr *mhp; int ndeep; int nitem; va_dcl #endif { va_list ap; int idx; int i; struct mbuf *result = NULL, *n; int len; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); va_start(ap, nitem); for (i = 0; i < nitem; i++) { idx = va_arg(ap, int); if (idx < 0 || idx > SADB_EXT_MAX) goto fail; /* don't attempt to pull empty extension */ if (idx == SADB_EXT_RESERVED && mhp->msg == NULL) continue; if (idx != SADB_EXT_RESERVED && (mhp->ext[idx] == NULL || mhp->extlen[idx] == 0)) continue; if (idx == SADB_EXT_RESERVED) { len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len)); MGETHDR(n, M_DONTWAIT, MT_DATA); if (!n) goto fail; n->m_len = len; n->m_next = NULL; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t)); } else if (i < ndeep) { len = mhp->extlen[idx]; n = key_alloc_mbuf(len); if (!n || n->m_next) { /*XXX*/ if (n) m_freem(n); goto fail; } m_copydata(m, mhp->extoff[idx], mhp->extlen[idx], mtod(n, caddr_t)); } else { n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx], M_DONTWAIT); } if (n == NULL) goto fail; if (result) m_cat(result, n); else result = n; } va_end(ap); if ((result->m_flags & M_PKTHDR) != 0) { result->m_pkthdr.len = 0; for (n = result; n; n = n->m_next) result->m_pkthdr.len += n->m_len; } return result; fail: m_freem(result); return NULL; } /* * SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing * add an entry to SP database, when received * * from the user(?). * Adding to SP database, * and send * * to the socket which was send. * * SPDADD set a unique policy entry. * SPDSETIDX like SPDADD without a part of policy requests. * SPDUPDATE replace a unique policy entry. * * m will always be freed. 
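 *
 * The reply is assembled with key_gather_mbuf() above; its ndeep
 * argument gives how many of the listed extensions are deep-copied
 * into fresh writable mbufs, while the rest are shared via m_copym().
 * For example, the call issued below,
 *
 *	n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED,
 *	    SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
 *
 * deep-copies the message header and the policy extension (the policy
 * id is patched into the reply) and merely references the addresses.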
*/ static int key_spdadd(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0, *xpl; struct sadb_lifetime *lft = NULL; struct secpolicyindex spidx; struct secpolicy *newsp; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(struct sadb_lifetime)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; /* make secindex */ /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &spidx); /* checking the direciton. */ switch (xpl0->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: case IPSEC_DIR_OUTBOUND: break; default: ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__)); mhp->msg->sadb_msg_errno = EINVAL; return 0; } /* check policy */ /* key_spdadd() accepts DISCARD, NONE and IPSEC. */ if (xpl0->sadb_x_policy_type == IPSEC_POLICY_ENTRUST || xpl0->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { ipseclog((LOG_DEBUG, "%s: Invalid policy type.\n", __func__)); return key_senderror(so, m, EINVAL); } /* policy requests are mandatory when action is ipsec. */ if (mhp->msg->sadb_msg_type != SADB_X_SPDSETIDX && xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC && mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) { ipseclog((LOG_DEBUG, "%s: some policy requests part required\n", __func__)); return key_senderror(so, m, EINVAL); } /* * checking there is SP already or not. * SPDUPDATE doesn't depend on whether there is a SP or not. * If the type is either SPDADD or SPDSETIDX AND a SP is found, * then error. 
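 *
 * e.g. with an SP already installed for this policy index:
 *	SADB_X_SPDADD / SADB_X_SPDSETIDX  ->  EEXIST to the caller
 *	SADB_X_SPDUPDATE                  ->  old SP marked DEAD and the
 *	                                      replacement installed below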
*/ newsp = key_getsp(&spidx); if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { if (newsp) { newsp->state = IPSEC_SPSTATE_DEAD; KEY_FREESP(&newsp); } } else { if (newsp != NULL) { KEY_FREESP(&newsp); ipseclog((LOG_DEBUG, "%s: a SP entry exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } } /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) { return key_senderror(so, m, error); } if ((newsp->id = key_getnewspid()) == 0) { _key_delsp(newsp); return key_senderror(so, m, ENOBUFS); } /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &newsp->spidx); /* sanity check on addr pair */ if (((struct sockaddr *)(src0 + 1))->sa_family != ((struct sockaddr *)(dst0+ 1))->sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } if (((struct sockaddr *)(src0 + 1))->sa_len != ((struct sockaddr *)(dst0+ 1))->sa_len) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } #if 1 if (newsp->req && newsp->req->saidx.src.sa.sa_family) { struct sockaddr *sa; sa = (struct sockaddr *)(src0 + 1); if (sa->sa_family != newsp->req->saidx.src.sa.sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } } if (newsp->req && newsp->req->saidx.dst.sa.sa_family) { struct sockaddr *sa; sa = (struct sockaddr *)(dst0 + 1); if (sa->sa_family != newsp->req->saidx.dst.sa.sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } } #endif newsp->created = time_second; newsp->lastused = newsp->created; newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0; newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0; newsp->refcnt = 1; /* do not reclaim until I say I do */ newsp->state = IPSEC_SPSTATE_ALIVE; LIST_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, secpolicy, chain); /* delete the entry in spacqtree */ if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { struct secspacq *spacq = key_getspacq(&spidx); if (spacq != NULL) { /* reset counter in order to deletion by timehandler. */ spacq->created = time_second; spacq->count = 0; SPACQ_UNLOCK(); } } { struct mbuf *n, *mpolicy; struct sadb_msg *newmsg; int off; /* create new sadb_msg to reply. */ if (lft) { n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); } else { n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); } if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(*newmsg)) { n = m_pullup(n, sizeof(*newmsg)); if (!n) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); off = 0; mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)), sizeof(*xpl), &off); if (mpolicy == NULL) { /* n is already freed */ return key_senderror(so, m, ENOBUFS); } xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off); if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) { m_freem(n); return key_senderror(so, m, EINVAL); } xpl->sadb_x_policy_id = newsp->id; m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * get new policy id. * OUT: * 0: failure. * others: success. 
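 *
 * Ids are drawn from V_policy_id with wrap-around (~0 -> 1) and retried
 * up to V_key_spi_trycnt times while a candidate is still in use, e.g.:
 *
 *	V_policy_id == 5  ->  try 6, 7, ... until key_getspbyid() misses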
*/ static u_int32_t key_getnewspid() { INIT_VNET_IPSEC(curvnet); u_int32_t newid = 0; int count = V_key_spi_trycnt; /* XXX */ struct secpolicy *sp; /* when requesting to allocate spi ranged */ while (count--) { newid = (V_policy_id = (V_policy_id == ~0 ? 1 : V_policy_id + 1)); if ((sp = key_getspbyid(newid)) == NULL) break; KEY_FREESP(&sp); } if (count == 0 || newid == 0) { ipseclog((LOG_DEBUG, "%s: to allocate policy id is failed.\n", __func__)); return 0; } return newid; } /* * SADB_SPDDELETE processing * receive * * from the user(?), and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spddelete(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0; struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(so != NULL, ("null so")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; /* make secindex */ /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &spidx); /* checking the direciton. */ switch (xpl0->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: case IPSEC_DIR_OUTBOUND: break; default: ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__)); return key_senderror(so, m, EINVAL); } /* Is there SP in SPD ? */ if ((sp = key_getsp(&spidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__)); return key_senderror(so, m, EINVAL); } /* save policy id to buffer to be returned. */ xpl0->sadb_x_policy_id = sp->id; sp->state = IPSEC_SPSTATE_DEAD; KEY_FREESP(&sp); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_SPDDELETE2 processing * receive * * from the user(?), and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. 
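 *
 * Unlike SADB_X_SPDDELETE above, which identifies the SP by its policy
 * index (addresses, prefixes, direction), this variant looks the SP up
 * purely by sadb_x_policy_id.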
*/ static int key_spddelete2(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); u_int32_t id; struct secpolicy *sp; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id)); return key_senderror(so, m, EINVAL); } sp->state = IPSEC_SPSTATE_DEAD; KEY_FREESP(&sp); { struct mbuf *n, *nn; struct sadb_msg *newmsg; int off, len; /* create new sadb_msg to reply. */ len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); MGETHDR(n, M_DONTWAIT, MT_DATA); if (n && len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)", off, len)); n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY], mhp->extlen[SADB_X_EXT_POLICY], M_DONTWAIT); if (!n->m_next) { m_freem(n); return key_senderror(so, m, ENOBUFS); } n->m_pkthdr.len = 0; for (nn = n; nn; nn = nn->m_next) n->m_pkthdr.len += nn->m_len; newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_X_GET processing * receive * * from the user(?), * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spdget(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); u_int32_t id; struct secpolicy *sp; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id)); return key_senderror(so, m, ENOENT); } n = key_setdumpsp(sp, SADB_X_SPDGET, 0, mhp->msg->sadb_msg_pid); if (n != NULL) { m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } else return key_senderror(so, m, ENOBUFS); } /* * SADB_X_SPDACQUIRE processing. * Acquire policy and SA(s) for a *OUTBOUND* packet. * send * * to KMD, and expect to receive * with SADB_X_SPDACQUIRE if error occured, * or * * with SADB_X_SPDUPDATE from KMD by PF_KEY. * policy(*) is without policy requests. 
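 *
 * Acquires are rate limited through a per-index secspacq entry: a fresh
 * entry, or one whose count has exceeded V_key_blockacq_count, sends
 * another SADB_X_SPDACQUIRE to the key management daemon; intermediate
 * requests only bump the counter (e.g. with a count of 10, ten
 * back-to-back misses are suppressed before the next message goes out).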
* * 0 : succeed * others: error number */ int key_spdacquire(sp) struct secpolicy *sp; { INIT_VNET_IPSEC(curvnet); struct mbuf *result = NULL, *m; struct secspacq *newspacq; IPSEC_ASSERT(sp != NULL, ("null secpolicy")); IPSEC_ASSERT(sp->req == NULL, ("policy exists")); IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("policy not IPSEC %u", sp->policy)); /* Get an entry to check whether sent message or not. */ newspacq = key_getspacq(&sp->spidx); if (newspacq != NULL) { if (V_key_blockacq_count < newspacq->count) { /* reset counter and do send message. */ newspacq->count = 0; } else { /* increment counter and do nothing. */ newspacq->count++; return 0; } SPACQ_UNLOCK(); } else { /* make new entry for blocking to send SADB_ACQUIRE. */ newspacq = key_newspacq(&sp->spidx); if (newspacq == NULL) return ENOBUFS; } /* create new sadb_msg to reply. */ m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0); if (!m) return ENOBUFS; result = m; result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, m, KEY_SENDUP_REGISTERED); } /* * SADB_SPDFLUSH processing * receive * * from the user, and free all entries in secpctree. * and send, * * to the user. * NOTE: what to do is only marking SADB_SASTATE_DEAD. * * m will always be freed. */ static int key_spdflush(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_msg *newmsg; struct secpolicy *sp; u_int dir; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg))) return key_senderror(so, m, EINVAL); for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); } if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } if (m->m_next) m_freem(m->m_next); m->m_next = NULL; m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); newmsg = mtod(m, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } /* * SADB_SPDDUMP processing * receive * * from the user, and dump all SP leaves * and send, * ..... * to the ikmpd. * * m will always be freed. */ static int key_spddump(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int cnt; u_int dir; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* search SPD entry and get buffer size. 
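 *
 * Each SP is emitted as a separate PF_KEY message; cnt counts down so
 * that the final message carries sadb_msg_seq == 0, letting the
 * listener detect the end of the dump.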
*/ cnt = 0; for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { LIST_FOREACH(sp, &V_sptree[dir], chain) { cnt++; } } if (cnt == 0) return key_senderror(so, m, ENOENT); for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { LIST_FOREACH(sp, &V_sptree[dir], chain) { --cnt; n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt, mhp->msg->sadb_msg_pid); if (n) key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } m_freem(m); return 0; } static struct mbuf * key_setdumpsp(sp, type, seq, pid) struct secpolicy *sp; u_int8_t type; u_int32_t seq, pid; { struct mbuf *result = NULL, *m; struct seclifetime lt; m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt); if (!m) goto fail; result = m; m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sp->spidx.src.sa, sp->spidx.prefs, sp->spidx.ul_proto); if (!m) goto fail; m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sp->spidx.dst.sa, sp->spidx.prefd, sp->spidx.ul_proto); if (!m) goto fail; m_cat(result, m); m = key_sp2msg(sp); if (!m) goto fail; m_cat(result, m); if(sp->lifetime){ lt.addtime=sp->created; lt.usetime= sp->lastused; m = key_setlifetime(<, SADB_EXT_LIFETIME_CURRENT); if (!m) goto fail; m_cat(result, m); lt.addtime=sp->lifetime; lt.usetime= sp->validtime; m = key_setlifetime(<, SADB_EXT_LIFETIME_HARD); if (!m) goto fail; m_cat(result, m); } if ((result->m_flags & M_PKTHDR) == 0) goto fail; if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto fail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return result; fail: m_freem(result); return NULL; } /* * get PFKEY message length for security policy and request. */ static u_int key_getspreqmsglen(sp) struct secpolicy *sp; { u_int tlen; tlen = sizeof(struct sadb_x_policy); /* if is the policy for ipsec ? */ if (sp->policy != IPSEC_POLICY_IPSEC) return tlen; /* get length of ipsec requests */ { struct ipsecrequest *isr; int len; for (isr = sp->req; isr != NULL; isr = isr->next) { len = sizeof(struct sadb_x_ipsecrequest) + isr->saidx.src.sa.sa_len + isr->saidx.dst.sa.sa_len; tlen += PFKEY_ALIGN8(len); } } return tlen; } /* * SADB_SPDEXPIRE processing * send * * to KMD by PF_KEY. * * OUT: 0 : succeed * others : error number */ static int key_spdexpire(sp) struct secpolicy *sp; { struct mbuf *result = NULL, *m; int len; int error = -1; struct sadb_lifetime *lt; /* XXX: Why do we lock ? 
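 *
 * (All PF_KEY lengths below are in the usual units: PFKEY_ALIGN8()
 * rounds a byte count up to a multiple of 8 and PFKEY_UNIT64() converts
 * bytes to 64-bit words, e.g. PFKEY_ALIGN8(20) == 24 and
 * PFKEY_UNIT64(24) == 3.)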
*/ IPSEC_ASSERT(sp != NULL, ("null secpolicy")); /* set msg header */ m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0); if (!m) { error = ENOBUFS; goto fail; } result = m; /* create lifetime extension (current and hard) */ len = PFKEY_ALIGN8(sizeof(*lt)) * 2; m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); error = ENOBUFS; goto fail; } bzero(mtod(m, caddr_t), len); lt = mtod(m, struct sadb_lifetime *); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lt->sadb_lifetime_allocations = 0; lt->sadb_lifetime_bytes = 0; lt->sadb_lifetime_addtime = sp->created; lt->sadb_lifetime_usetime = sp->lastused; lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lt->sadb_lifetime_allocations = 0; lt->sadb_lifetime_bytes = 0; lt->sadb_lifetime_addtime = sp->lifetime; lt->sadb_lifetime_usetime = sp->validtime; m_cat(result, m); /* set sadb_address for source */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sp->spidx.src.sa, sp->spidx.prefs, sp->spidx.ul_proto); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set sadb_address for destination */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sp->spidx.dst.sa, sp->spidx.prefd, sp->spidx.ul_proto); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set secpolicy */ m = key_sp2msg(sp); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); return error; } /* %%% SAD management */ /* * allocating a memory for new SA head, and copy from the values of mhp. * OUT: NULL : failure due to the lack of memory. * others : pointer to new SA head. */ static struct secashead * key_newsah(saidx) struct secasindex *saidx; { INIT_VNET_IPSEC(curvnet); struct secashead *newsah; IPSEC_ASSERT(saidx != NULL, ("null saidx")); newsah = malloc(sizeof(struct secashead), M_IPSEC_SAH, M_NOWAIT|M_ZERO); if (newsah != NULL) { int i; for (i = 0; i < sizeof(newsah->savtree)/sizeof(newsah->savtree[0]); i++) LIST_INIT(&newsah->savtree[i]); newsah->saidx = *saidx; /* add to saidxtree */ newsah->state = SADB_SASTATE_MATURE; SAHTREE_LOCK(); LIST_INSERT_HEAD(&V_sahtree, newsah, chain); SAHTREE_UNLOCK(); } return(newsah); } /* * delete SA index and all SA registerd. */ static void key_delsah(sah) struct secashead *sah; { INIT_VNET_IPSEC(curvnet); struct secasvar *sav, *nextsav; u_int stateidx; int zombie = 0; IPSEC_ASSERT(sah != NULL, ("NULL sah")); SAHTREE_LOCK_ASSERT(); /* searching all SA registerd in the secindex. 
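 *
 * SAs still referenced elsewhere are left behind as zombies; the head
 * itself is freed only when no zombie remains, so the delete is
 * effectively retried when the last SA reference goes away.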
*/ for (stateidx = 0; stateidx < _ARRAYLEN(V_saorder_state_any); stateidx++) { u_int state = V_saorder_state_any[stateidx]; LIST_FOREACH_SAFE(sav, &sah->savtree[state], chain, nextsav) { if (sav->refcnt == 0) { /* sanity check */ KEY_CHKSASTATE(state, sav->state, __func__); KEY_FREESAV(&sav); } else { /* give up to delete this sa */ zombie++; } } } if (!zombie) { /* delete only if there are savs */ /* remove from tree of SA index */ if (__LIST_CHAINED(sah)) LIST_REMOVE(sah, chain); if (sah->sa_route.ro_rt) { RTFREE(sah->sa_route.ro_rt); sah->sa_route.ro_rt = (struct rtentry *)NULL; } free(sah, M_IPSEC_SAH); } } /* * allocating a new SA with LARVAL state. key_add() and key_getspi() call, * and copy the values of mhp into new buffer. * When SAD message type is GETSPI: * to set sequence number from acq_seq++, * to set zero to SPI. * not to call key_setsava(). * OUT: NULL : fail * others : pointer to new secasvar. * * does not modify mbuf. does not free mbuf on error. */ static struct secasvar * key_newsav(m, mhp, sah, errp, where, tag) struct mbuf *m; const struct sadb_msghdr *mhp; struct secashead *sah; int *errp; const char* where; int tag; { INIT_VNET_IPSEC(curvnet); struct secasvar *newsav; const struct sadb_sa *xsa; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); IPSEC_ASSERT(sah != NULL, ("null secashead")); newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT|M_ZERO); if (newsav == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); *errp = ENOBUFS; goto done; } switch (mhp->msg->sadb_msg_type) { case SADB_GETSPI: newsav->spi = 0; #ifdef IPSEC_DOSEQCHECK /* sync sequence number */ if (mhp->msg->sadb_msg_seq == 0) newsav->seq = (V_acq_seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq)); else #endif newsav->seq = mhp->msg->sadb_msg_seq; break; case SADB_ADD: /* sanity check */ if (mhp->ext[SADB_EXT_SA] == NULL) { free(newsav, M_IPSEC_SA); newsav = NULL; ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); *errp = EINVAL; goto done; } xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; newsav->spi = xsa->sadb_sa_spi; newsav->seq = mhp->msg->sadb_msg_seq; break; default: free(newsav, M_IPSEC_SA); newsav = NULL; *errp = EINVAL; goto done; } /* copy sav values */ if (mhp->msg->sadb_msg_type != SADB_GETSPI) { *errp = key_setsaval(newsav, m, mhp); if (*errp) { free(newsav, M_IPSEC_SA); newsav = NULL; goto done; } } SECASVAR_LOCK_INIT(newsav); /* reset created */ newsav->created = time_second; newsav->pid = mhp->msg->sadb_msg_pid; /* add to satree */ newsav->sah = sah; sa_initref(newsav); newsav->state = SADB_SASTATE_LARVAL; /* XXX locking??? */ LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav, secasvar, chain); done: KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u return SP:%p\n", __func__, where, tag, newsav)); return newsav; } /* * free() SA variable entry. */ static void key_cleansav(struct secasvar *sav) { /* * Cleanup xform state. Note that zeroize'ing causes the * keys to be cleared; otherwise we must do it ourself. 
if (sav->tdb_xform != NULL) { sav->tdb_xform->xf_zeroize(sav); sav->tdb_xform = NULL; } else { KASSERT(sav->iv == NULL, ("iv but no xform")); if (sav->key_auth != NULL) bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth)); if (sav->key_enc != NULL) bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc)); } if (sav->key_auth != NULL) { if (sav->key_auth->key_data != NULL) free(sav->key_auth->key_data, M_IPSEC_MISC); free(sav->key_auth, M_IPSEC_MISC); sav->key_auth = NULL; } if (sav->key_enc != NULL) { if (sav->key_enc->key_data != NULL) free(sav->key_enc->key_data, M_IPSEC_MISC); free(sav->key_enc, M_IPSEC_MISC); sav->key_enc = NULL; } if (sav->sched) { bzero(sav->sched, sav->schedlen); free(sav->sched, M_IPSEC_MISC); sav->sched = NULL; } if (sav->replay != NULL) { free(sav->replay, M_IPSEC_MISC); sav->replay = NULL; } if (sav->lft_c != NULL) { free(sav->lft_c, M_IPSEC_MISC); sav->lft_c = NULL; } if (sav->lft_h != NULL) { free(sav->lft_h, M_IPSEC_MISC); sav->lft_h = NULL; } if (sav->lft_s != NULL) { free(sav->lft_s, M_IPSEC_MISC); sav->lft_s = NULL; } } /* * free() SA variable entry. */ static void key_delsav(sav) struct secasvar *sav; { IPSEC_ASSERT(sav != NULL, ("null sav")); IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0", sav->refcnt)); /* remove from SA header */ if (__LIST_CHAINED(sav)) LIST_REMOVE(sav, chain); key_cleansav(sav); SECASVAR_LOCK_DESTROY(sav); free(sav, M_IPSEC_SA); } /* * search SAD. * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secashead * key_getsah(saidx) struct secasindex *saidx; { INIT_VNET_IPSEC(curvnet); struct secashead *sah; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID)) break; } SAHTREE_UNLOCK(); return sah; } /* * check that the SPI is not duplicated. * NOTE: this function is too slow due to searching all SAD. * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secasvar * key_checkspidup(saidx, spi) struct secasindex *saidx; u_int32_t spi; { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; /* check address family */ if (saidx->src.sa.sa_family != saidx->dst.sa.sa_family) { ipseclog((LOG_DEBUG, "%s: address family mismatched.\n", __func__)); return NULL; } sav = NULL; /* check all SAD */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst)) continue; sav = key_getsavbyspi(sah, spi); if (sav != NULL) break; } SAHTREE_UNLOCK(); return sav; } /* * search SAD, limited to alive SAs, by protocol and SPI. * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secasvar * key_getsavbyspi(sah, spi) struct secashead *sah; u_int32_t spi; { INIT_VNET_IPSEC(curvnet); struct secasvar *sav; u_int stateidx, state; sav = NULL; SAHTREE_LOCK_ASSERT(); /* search all status */ for (stateidx = 0; stateidx < _ARRAYLEN(V_saorder_state_alive); stateidx++) { state = V_saorder_state_alive[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ if (sav->state != state) { ipseclog((LOG_DEBUG, "%s: " "invalid sav->state (queue: %d SA: %d)\n", __func__, state, sav->state)); continue; } if (sav->spi == spi) return sav; } } return NULL; } /* * copy SA values from PF_KEY message except *SPI, SEQ, PID, STATE and TYPE*. * You must update these if needed. * OUT: 0: success. * !0: failure. * * does not modify mbuf. does not free mbuf on error.
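 *
 * For new-style SAs the replay state is carved out of one allocation;
 * with sadb_sa_replay == 32 (illustrative) the layout built below is:
 *
 *	malloc(sizeof(struct secreplay) + 32)
 *	sav->replay->wsize  = 32	window size in bytes
 *	sav->replay->bitmap = (caddr_t)(sav->replay + 1)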
*/ static int key_setsaval(sav, m, mhp) struct secasvar *sav; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); int error = 0; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* initialization */ sav->replay = NULL; sav->key_auth = NULL; sav->key_enc = NULL; sav->sched = NULL; sav->schedlen = 0; sav->iv = NULL; sav->lft_c = NULL; sav->lft_h = NULL; sav->lft_s = NULL; sav->tdb_xform = NULL; /* transform */ sav->tdb_encalgxform = NULL; /* encoding algorithm */ sav->tdb_authalgxform = NULL; /* authentication algorithm */ sav->tdb_compalgxform = NULL; /* compression algorithm */ /* SA */ if (mhp->ext[SADB_EXT_SA] != NULL) { const struct sadb_sa *sa0; sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; if (mhp->extlen[SADB_EXT_SA] < sizeof(*sa0)) { error = EINVAL; goto fail; } sav->alg_auth = sa0->sadb_sa_auth; sav->alg_enc = sa0->sadb_sa_encrypt; sav->flags = sa0->sadb_sa_flags; /* replay window */ if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) { sav->replay = (struct secreplay *) malloc(sizeof(struct secreplay)+sa0->sadb_sa_replay, M_IPSEC_MISC, M_NOWAIT|M_ZERO); if (sav->replay == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } if (sa0->sadb_sa_replay != 0) sav->replay->bitmap = (caddr_t)(sav->replay+1); sav->replay->wsize = sa0->sadb_sa_replay; } } /* Authentication keys */ if (mhp->ext[SADB_EXT_KEY_AUTH] != NULL) { const struct sadb_key *key0; int len; key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH]; len = mhp->extlen[SADB_EXT_KEY_AUTH]; error = 0; if (len < sizeof(*key0)) { error = EINVAL; goto fail; } switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_TCPSIGNATURE: if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) && sav->alg_auth != SADB_X_AALG_NULL) error = EINVAL; break; case SADB_X_SATYPE_IPCOMP: default: error = EINVAL; break; } if (error) { ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n", __func__)); goto fail; } sav->key_auth = (struct seckey *)key_dup_keymsg(key0, len, M_IPSEC_MISC); if (sav->key_auth == NULL ) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } } /* Encryption key */ if (mhp->ext[SADB_EXT_KEY_ENCRYPT] != NULL) { const struct sadb_key *key0; int len; key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT]; len = mhp->extlen[SADB_EXT_KEY_ENCRYPT]; error = 0; if (len < sizeof(*key0)) { error = EINVAL; goto fail; } switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_ESP: if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) && sav->alg_enc != SADB_EALG_NULL) { error = EINVAL; break; } sav->key_enc = (struct seckey *)key_dup_keymsg(key0, len, M_IPSEC_MISC); if (sav->key_enc == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } break; case SADB_X_SATYPE_IPCOMP: if (len != PFKEY_ALIGN8(sizeof(struct sadb_key))) error = EINVAL; sav->key_enc = NULL; /*just in case*/ break; case SADB_SATYPE_AH: case SADB_X_SATYPE_TCPSIGNATURE: default: error = EINVAL; break; } if (error) { ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n", __func__)); goto fail; } } /* set iv */ sav->ivlen = 0; switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_AH: error = xform_init(sav, XF_AH); break; case SADB_SATYPE_ESP: error = xform_init(sav, XF_ESP); break; case SADB_X_SATYPE_IPCOMP: error = xform_init(sav, XF_IPCOMP); break; case SADB_X_SATYPE_TCPSIGNATURE: error = xform_init(sav, 
XF_TCPSIGNATURE); break; } if (error) { ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n", __func__, mhp->msg->sadb_msg_satype)); goto fail; } /* reset created */ sav->created = time_second; /* make lifetime for CURRENT */ sav->lft_c = malloc(sizeof(struct seclifetime), M_IPSEC_MISC, M_NOWAIT); if (sav->lft_c == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } sav->lft_c->allocations = 0; sav->lft_c->bytes = 0; sav->lft_c->addtime = time_second; sav->lft_c->usetime = 0; /* lifetimes for HARD and SOFT */ { const struct sadb_lifetime *lft0; lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; if (lft0 != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(*lft0)) { error = EINVAL; goto fail; } sav->lft_h = key_dup_lifemsg(lft0, M_IPSEC_MISC); if (sav->lft_h == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); error = ENOBUFS; goto fail; } /* to be initialized? */ } lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_SOFT]; if (lft0 != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_SOFT] < sizeof(*lft0)) { error = EINVAL; goto fail; } sav->lft_s = key_dup_lifemsg(lft0, M_IPSEC_MISC); if (sav->lft_s == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); error = ENOBUFS; goto fail; } /* to be initialized? */ } } return 0; fail: /* initialization */ key_cleansav(sav); return error; } /* * validate a secasvar entry, and set its state to SADB_SASTATE_MATURE. * OUT: 0: valid * other: errno */ static int key_mature(struct secasvar *sav) { INIT_VNET_IPSEC(curvnet); int error; /* check SPI value */ switch (sav->sah->saidx.proto) { case IPPROTO_ESP: case IPPROTO_AH: /* * RFC 4302, 2.4. Security Parameters Index (SPI), SPI values * 1-255 reserved by IANA for future use, * 0 for implementation specific, local use. */ if (ntohl(sav->spi) <= 255) { ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n", __func__, (u_int32_t)ntohl(sav->spi))); return EINVAL; } break; } /* check satype */ switch (sav->sah->saidx.proto) { case IPPROTO_ESP: /* check flags */ if ((sav->flags & (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) == (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) { ipseclog((LOG_DEBUG, "%s: invalid flag (derived) " "given to old-esp.\n", __func__)); return EINVAL; } error = xform_init(sav, XF_ESP); break; case IPPROTO_AH: /* check flags */ if (sav->flags & SADB_X_EXT_DERIV) { ipseclog((LOG_DEBUG, "%s: invalid flag (derived) " "given to AH SA.\n", __func__)); return EINVAL; } if (sav->alg_enc != SADB_EALG_NONE) { ipseclog((LOG_DEBUG, "%s: protocol and algorithm " "mismatched.\n", __func__)); return(EINVAL); } error = xform_init(sav, XF_AH); break; case IPPROTO_IPCOMP: if (sav->alg_auth != SADB_AALG_NONE) { ipseclog((LOG_DEBUG, "%s: protocol and algorithm " "mismatched.\n", __func__)); return(EINVAL); } if ((sav->flags & SADB_X_EXT_RAWCPI) == 0 && ntohl(sav->spi) >= 0x10000) { ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n", __func__)); return(EINVAL); } error = xform_init(sav, XF_IPCOMP); break; case IPPROTO_TCP: if (sav->alg_enc != SADB_EALG_NONE) { ipseclog((LOG_DEBUG, "%s: protocol and algorithm " "mismatched.\n", __func__)); return(EINVAL); } error = xform_init(sav, XF_TCPSIGNATURE); break; default: ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__)); error = EPROTONOSUPPORT; break; } if (error == 0) { SAHTREE_LOCK(); key_sa_chgstate(sav, SADB_SASTATE_MATURE); SAHTREE_UNLOCK(); } return (error); } /* * subroutine for SADB_GET and SADB_DUMP.
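*
* (Editor's note: key_setdumpsa() below walks dumporder[] backwards and,
* via m_cat(m, tres), splices the previously built chain after each newer
* extension, so the finished message lists extensions in forward
* dumporder. A hypothetical restatement, illustration only:)
*/
#if 0	/* illustrative sketch only; build_ext() is hypothetical */
	struct mbuf *chain = NULL, *ext;
	for (i = nexts - 1; i >= 0; i--) {
		ext = build_ext(dumporder[i]);
		if (chain != NULL)
			m_cat(ext, chain);	/* older chain goes behind */
		chain = ext;
	}
	/* chain now runs ext[0] -> ext[1] -> ... -> ext[nexts - 1] */
#endif
/*
* key_setdumpsa() proper: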
*/ static struct mbuf * key_setdumpsa(sav, type, satype, seq, pid) struct secasvar *sav; u_int8_t type, satype; u_int32_t seq, pid; { struct mbuf *result = NULL, *tres = NULL, *m; int i; int dumporder[] = { SADB_EXT_SA, SADB_X_EXT_SA2, SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT, SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY, SADB_EXT_KEY_AUTH, SADB_EXT_KEY_ENCRYPT, SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST, SADB_EXT_SENSITIVITY, }; m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt); if (m == NULL) goto fail; result = m; for (i = sizeof(dumporder)/sizeof(dumporder[0]) - 1; i >= 0; i--) { m = NULL; switch (dumporder[i]) { case SADB_EXT_SA: m = key_setsadbsa(sav); if (!m) goto fail; break; case SADB_X_EXT_SA2: m = key_setsadbxsa2(sav->sah->saidx.mode, sav->replay ? sav->replay->count : 0, sav->sah->saidx.reqid); if (!m) goto fail; break; case SADB_EXT_ADDRESS_SRC: m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sav->sah->saidx.src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) goto fail; break; case SADB_EXT_ADDRESS_DST: m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sav->sah->saidx.dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) goto fail; break; case SADB_EXT_KEY_AUTH: if (!sav->key_auth) continue; m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH); if (!m) goto fail; break; case SADB_EXT_KEY_ENCRYPT: if (!sav->key_enc) continue; m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT); if (!m) goto fail; break; case SADB_EXT_LIFETIME_CURRENT: if (!sav->lft_c) continue; m = key_setlifetime(sav->lft_c, SADB_EXT_LIFETIME_CURRENT); if (!m) goto fail; break; case SADB_EXT_LIFETIME_HARD: if (!sav->lft_h) continue; m = key_setlifetime(sav->lft_h, SADB_EXT_LIFETIME_HARD); if (!m) goto fail; break; case SADB_EXT_LIFETIME_SOFT: if (!sav->lft_s) continue; m = key_setlifetime(sav->lft_s, SADB_EXT_LIFETIME_SOFT); if (!m) goto fail; break; case SADB_EXT_ADDRESS_PROXY: case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: /* XXX: should we brought from SPD ? */ case SADB_EXT_SENSITIVITY: default: continue; } if (!m) goto fail; if (tres) m_cat(m, tres); tres = m; } m_cat(result, tres); if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto fail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return result; fail: m_freem(result); m_freem(tres); return NULL; } /* * set data into sadb_msg. */ static struct mbuf * key_setsadbmsg(type, tlen, satype, seq, pid, reserved) u_int8_t type, satype; u_int16_t tlen; u_int32_t seq; pid_t pid; u_int16_t reserved; { struct mbuf *m; struct sadb_msg *p; int len; len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); if (len > MCLBYTES) return NULL; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m && len > MHLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m = NULL; } } if (!m) return NULL; m->m_pkthdr.len = m->m_len = len; m->m_next = NULL; p = mtod(m, struct sadb_msg *); bzero(p, len); p->sadb_msg_version = PF_KEY_V2; p->sadb_msg_type = type; p->sadb_msg_errno = 0; p->sadb_msg_satype = satype; p->sadb_msg_len = PFKEY_UNIT64(tlen); p->sadb_msg_reserved = reserved; p->sadb_msg_seq = seq; p->sadb_msg_pid = (u_int32_t)pid; return m; } /* * copy secasvar data into sadb_address. 
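* (Editor's note: despite the wording above, key_setsadbsa() below fills in
* a struct sadb_sa, not a sadb_address.  PF_KEY extension lengths are kept
* in 8-byte units: with sizeof(struct sadb_sa) == 16, PFKEY_ALIGN8(16) is
* 16 and PFKEY_UNIT64(16) is 2, the value stored in sadb_sa_len.)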
*/ static struct mbuf * key_setsadbsa(sav) struct secasvar *sav; { struct mbuf *m; struct sadb_sa *p; int len; len = PFKEY_ALIGN8(sizeof(struct sadb_sa)); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_sa *); bzero(p, len); p->sadb_sa_len = PFKEY_UNIT64(len); p->sadb_sa_exttype = SADB_EXT_SA; p->sadb_sa_spi = sav->spi; p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0); p->sadb_sa_state = sav->state; p->sadb_sa_auth = sav->alg_auth; p->sadb_sa_encrypt = sav->alg_enc; p->sadb_sa_flags = sav->flags; return m; } /* * set data into sadb_address. */ static struct mbuf * key_setsadbaddr(exttype, saddr, prefixlen, ul_proto) u_int16_t exttype; const struct sockaddr *saddr; u_int8_t prefixlen; u_int16_t ul_proto; { struct mbuf *m; struct sadb_address *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_address)) + PFKEY_ALIGN8(saddr->sa_len); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_address *); bzero(p, len); p->sadb_address_len = PFKEY_UNIT64(len); p->sadb_address_exttype = exttype; p->sadb_address_proto = ul_proto; if (prefixlen == FULLMASK) { switch (saddr->sa_family) { case AF_INET: prefixlen = sizeof(struct in_addr) << 3; break; case AF_INET6: prefixlen = sizeof(struct in6_addr) << 3; break; default: ; /*XXX*/ } } p->sadb_address_prefixlen = prefixlen; p->sadb_address_reserved = 0; bcopy(saddr, mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)), saddr->sa_len); return m; } /* * set data into sadb_x_sa2. */ static struct mbuf * key_setsadbxsa2(mode, seq, reqid) u_int8_t mode; u_int32_t seq, reqid; { struct mbuf *m; struct sadb_x_sa2 *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2)); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_x_sa2 *); bzero(p, len); p->sadb_x_sa2_len = PFKEY_UNIT64(len); p->sadb_x_sa2_exttype = SADB_X_EXT_SA2; p->sadb_x_sa2_mode = mode; p->sadb_x_sa2_reserved1 = 0; p->sadb_x_sa2_reserved2 = 0; p->sadb_x_sa2_sequence = seq; p->sadb_x_sa2_reqid = reqid; return m; } /* * set data into sadb_x_policy */ static struct mbuf * key_setsadbxpolicy(type, dir, id) u_int16_t type; u_int8_t dir; u_int32_t id; { struct mbuf *m; struct sadb_x_policy *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy)); m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); return NULL; } p = mtod(m, struct sadb_x_policy *); bzero(p, len); p->sadb_x_policy_len = PFKEY_UNIT64(len); p->sadb_x_policy_exttype = SADB_X_EXT_POLICY; p->sadb_x_policy_type = type; p->sadb_x_policy_dir = dir; p->sadb_x_policy_id = id; return m; } /* %%% utilities */ /* Take a key message (sadb_key) from the socket and turn it into one * of the kernel's key structures (seckey). 
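* The wire layout assumed below is the standard PF_KEY one — the raw key
* bytes immediately follow the fixed header, which is why the copy starts
* at src + sizeof(struct sadb_key):
*
*	| struct sadb_key | key bytes ... |
*	|<-- sizeof() --->|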
* * IN: pointer to the src * OUT: NULL no more memory */ struct seckey * key_dup_keymsg(const struct sadb_key *src, u_int len, struct malloc_type *type) { INIT_VNET_IPSEC(curvnet); struct seckey *dst; dst = (struct seckey *)malloc(sizeof(struct seckey), type, M_NOWAIT); if (dst != NULL) { dst->bits = src->sadb_key_bits; dst->key_data = (char *)malloc(len, type, M_NOWAIT); if (dst->key_data != NULL) { bcopy((const char *)src + sizeof(struct sadb_key), dst->key_data, len); } else { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); free(dst, type); dst = NULL; } } else { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); } return dst; } /* Take a lifetime message (sadb_lifetime) passed in on a socket and * turn it into one of the kernel's lifetime structures (seclifetime). * * IN: pointers to the source and the malloc type * OUT: NULL, no more memory */ static struct seclifetime * key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type) { INIT_VNET_IPSEC(curvnet); struct seclifetime *dst = NULL; dst = (struct seclifetime *)malloc(sizeof(struct seclifetime), type, M_NOWAIT); if (dst == NULL) { /* XXX counter */ ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); } else { dst->allocations = src->sadb_lifetime_allocations; dst->bytes = src->sadb_lifetime_bytes; dst->addtime = src->sadb_lifetime_addtime; dst->usetime = src->sadb_lifetime_usetime; } return dst; } /* compare my own address * OUT: 1: true, i.e. my address. * 0: false */ int key_ismyaddr(sa) struct sockaddr *sa; { #ifdef INET INIT_VNET_INET(curvnet); struct sockaddr_in *sin; struct in_ifaddr *ia; #endif IPSEC_ASSERT(sa != NULL, ("null sockaddr")); switch (sa->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; for (ia = V_in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { if (sin->sin_family == ia->ia_addr.sin_family && sin->sin_len == ia->ia_addr.sin_len && sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) { return 1; } } break; #endif #ifdef INET6 case AF_INET6: return key_ismyaddr6((struct sockaddr_in6 *)sa); #endif } return 0; } #ifdef INET6 /* * compare my own address for IPv6. * 1: ours * 0: other * NOTE: derived from ip6_input() in KAME. This needs further modification. */ #include static int key_ismyaddr6(sin6) struct sockaddr_in6 *sin6; { INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct in6_multi *in6m; for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (key_sockaddrcmp((struct sockaddr *)sin6, (struct sockaddr *)&ia->ia_addr, 0) == 0) return 1; /* * XXX Multicast * XXX why do we care about multicast here while we don't care * about IPv4 multicast?? * XXX scope */ in6m = NULL; IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m); if (in6m) return 1; } /* loopback, just for safety */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr)) return 1; return 0; } #endif /*INET6*/ /* * compare two secasindex structures. * flag specifies how to compare the two saidxes: CMP_EXACTLY compares * mode, reqid and the raw addresses; CMP_MODE_REQID also matches mode and * reqid; CMP_REQID matches reqid when the SPD entry requires one; CMP_HEAD * compares only protocol and addresses. Ports are never compared. * IN: * saidx0: source, it can be in SAD. * saidx1: object.
* OUT: * 1 : equal * 0 : not equal */ static int key_cmpsaidx( const struct secasindex *saidx0, const struct secasindex *saidx1, int flag) { /* sanity */ if (saidx0 == NULL && saidx1 == NULL) return 1; if (saidx0 == NULL || saidx1 == NULL) return 0; if (saidx0->proto != saidx1->proto) return 0; if (flag == CMP_EXACTLY) { if (saidx0->mode != saidx1->mode) return 0; if (saidx0->reqid != saidx1->reqid) return 0; if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.sa.sa_len) != 0 || bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.sa.sa_len) != 0) return 0; } else { /* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */ if (flag == CMP_MODE_REQID ||flag == CMP_REQID) { /* * If reqid of SPD is non-zero, unique SA is required. * The result must be of same reqid in this case. */ if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid) return 0; } if (flag == CMP_MODE_REQID) { if (saidx0->mode != IPSEC_MODE_ANY && saidx0->mode != saidx1->mode) return 0; } if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, 0) != 0) { return 0; } if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, 0) != 0) { return 0; } } return 1; } /* * compare two secindex structure exactly. * IN: * spidx0: source, it is often in SPD. * spidx1: object, it is often from PFKEY message. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpspidx_exactly( struct secpolicyindex *spidx0, struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) return 1; if (spidx0 == NULL || spidx1 == NULL) return 0; if (spidx0->prefs != spidx1->prefs || spidx0->prefd != spidx1->prefd || spidx0->ul_proto != spidx1->ul_proto) return 0; return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 && key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0; } /* * compare two secindex structure with mask. * IN: * spidx0: source, it is often in SPD. * spidx1: object, it is often from IP header. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpspidx_withmask( struct secpolicyindex *spidx0, struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) return 1; if (spidx0 == NULL || spidx1 == NULL) return 0; if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family || spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family || spidx0->src.sa.sa_len != spidx1->src.sa.sa_len || spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len) return 0; /* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */ if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY && spidx0->ul_proto != spidx1->ul_proto) return 0; switch (spidx0->src.sa.sa_family) { case AF_INET: if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY && spidx0->src.sin.sin_port != spidx1->src.sin.sin_port) return 0; if (!key_bbcmp(&spidx0->src.sin.sin_addr, &spidx1->src.sin.sin_addr, spidx0->prefs)) return 0; break; case AF_INET6: if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY && spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port) return 0; /* * scope_id check. if sin6_scope_id is 0, we regard it * as a wildcard scope, which matches any scope zone ID. 
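* (Editor's example: a selector carrying sin6_scope_id 0 matches both
* fe80::1%em0 and fe80::1%em1, while two nonzero but different zone IDs
* fail the test below.)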
*/ if (spidx0->src.sin6.sin6_scope_id && spidx1->src.sin6.sin6_scope_id && spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id) return 0; if (!key_bbcmp(&spidx0->src.sin6.sin6_addr, &spidx1->src.sin6.sin6_addr, spidx0->prefs)) return 0; break; default: /* XXX */ if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0) return 0; break; } switch (spidx0->dst.sa.sa_family) { case AF_INET: if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY && spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port) return 0; if (!key_bbcmp(&spidx0->dst.sin.sin_addr, &spidx1->dst.sin.sin_addr, spidx0->prefd)) return 0; break; case AF_INET6: if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY && spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port) return 0; /* * scope_id check. if sin6_scope_id is 0, we regard it * as a wildcard scope, which matches any scope zone ID. */ if (spidx0->dst.sin6.sin6_scope_id && spidx1->dst.sin6.sin6_scope_id && spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id) return 0; if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr, &spidx1->dst.sin6.sin6_addr, spidx0->prefd)) return 0; break; default: /* XXX */ if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0) return 0; break; } /* XXX Do we check other field ? e.g. flowinfo */ return 1; } /* returns 0 on match */ static int key_sockaddrcmp( const struct sockaddr *sa1, const struct sockaddr *sa2, int port) { #ifdef satosin #undef satosin #endif #define satosin(s) ((const struct sockaddr_in *)s) #ifdef satosin6 #undef satosin6 #endif #define satosin6(s) ((const struct sockaddr_in6 *)s) if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len) return 1; switch (sa1->sa_family) { case AF_INET: if (sa1->sa_len != sizeof(struct sockaddr_in)) return 1; if (satosin(sa1)->sin_addr.s_addr != satosin(sa2)->sin_addr.s_addr) { return 1; } if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port) return 1; break; case AF_INET6: if (sa1->sa_len != sizeof(struct sockaddr_in6)) return 1; /*EINVAL*/ if (satosin6(sa1)->sin6_scope_id != satosin6(sa2)->sin6_scope_id) { return 1; } if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr, &satosin6(sa2)->sin6_addr)) { return 1; } if (port && satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) { return 1; } break; default: if (bcmp(sa1, sa2, sa1->sa_len) != 0) return 1; break; } return 0; #undef satosin #undef satosin6 } /* * compare two buffers with mask. * IN: * addr1: source * addr2: object * bits: Number of bits to compare * OUT: * 1 : equal * 0 : not equal */ static int key_bbcmp(const void *a1, const void *a2, u_int bits) { const unsigned char *p1 = a1; const unsigned char *p2 = a2; /* XXX: This could be considerably faster if we compare a word * at a time, but it is complicated on LSB Endian machines */ /* Handle null pointers */ if (p1 == NULL || p2 == NULL) return (p1 == p2); while (bits >= 8) { if (*p1++ != *p2++) return 0; bits -= 8; } if (bits > 0) { u_int8_t mask = ~((1<<(8-bits))-1); if ((*p1 & mask) != (*p2 & mask)) return 0; } return 1; /* Match! 
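* (Editor's example: key_bbcmp(a, b, 25) compares three whole bytes, then
* masks the fourth with ~((1 << (8 - 1)) - 1) == 0x80 so only the top bit
* must agree — this is how the prefs/prefd prefix lengths above are
* applied to addresses.)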
*/ } static void key_flush_spd(time_t now) { INIT_VNET_IPSEC(curvnet); static u_int16_t sptree_scangen = 0; u_int16_t gen = sptree_scangen++; struct secpolicy *sp; u_int dir; /* SPD */ for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { restart: SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->scangen == gen) /* previously handled */ continue; sp->scangen = gen; if (sp->state == IPSEC_SPSTATE_DEAD) { /* NB: clean entries created by key_spdflush */ SPTREE_UNLOCK(); KEY_FREESP(&sp); goto restart; } if (sp->lifetime == 0 && sp->validtime == 0) continue; if ((sp->lifetime && now - sp->created > sp->lifetime) || (sp->validtime && now - sp->lastused > sp->validtime)) { sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); key_spdexpire(sp); KEY_FREESP(&sp); goto restart; } } SPTREE_UNLOCK(); } } static void key_flush_sad(time_t now) { INIT_VNET_IPSEC(curvnet); struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; /* SAD */ SAHTREE_LOCK(); LIST_FOREACH_SAFE(sah, &V_sahtree, chain, nextsah) { /* if sah has been dead, then delete it and process next sah. */ if (sah->state == SADB_SASTATE_DEAD) { key_delsah(sah); continue; } /* if LARVAL entry doesn't become MATURE, delete it. */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_LARVAL], chain, nextsav) { if (now - sav->created > V_key_larval_lifetime) KEY_FREESAV(&sav); } /* * check MATURE entry to start to send expire message * whether or not. */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_MATURE], chain, nextsav) { /* we don't need to check. */ if (sav->lft_s == NULL) continue; /* sanity check */ if (sav->lft_c == NULL) { ipseclog((LOG_DEBUG,"%s: there is no CURRENT " "time, why?\n", __func__)); continue; } /* check SOFT lifetime */ if (sav->lft_s->addtime != 0 && now - sav->created > sav->lft_s->addtime) { /* * check SA to be used whether or not. * when SA hasn't been used, delete it. */ if (sav->lft_c->usetime == 0) { key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } else { key_sa_chgstate(sav, SADB_SASTATE_DYING); /* * XXX If we keep to send expire * message in the status of * DYING. Do remove below code. */ key_expire(sav); } } /* check SOFT lifetime by bytes */ /* * XXX I don't know the way to delete this SA * when new SA is installed. Caution when it's * installed too big lifetime by time. */ else if (sav->lft_s->bytes != 0 && sav->lft_s->bytes < sav->lft_c->bytes) { key_sa_chgstate(sav, SADB_SASTATE_DYING); /* * XXX If we keep to send expire * message in the status of * DYING. Do remove below code. */ key_expire(sav); } } /* check DYING entry to change status to DEAD. */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DYING], chain, nextsav) { /* we don't need to check. */ if (sav->lft_h == NULL) continue; /* sanity check */ if (sav->lft_c == NULL) { ipseclog((LOG_DEBUG, "%s: there is no CURRENT " "time, why?\n", __func__)); continue; } if (sav->lft_h->addtime != 0 && now - sav->created > sav->lft_h->addtime) { key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } #if 0 /* XXX Should we keep to send expire message until HARD lifetime ? */ else if (sav->lft_s != NULL && sav->lft_s->addtime != 0 && now - sav->created > sav->lft_s->addtime) { /* * XXX: should be checked to be * installed the valid SA. */ /* * If there is no SA then sending * expire message. 
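* (Editor's note: the overall progression in this scan is LARVAL ->
* MATURE -> DYING -> DEAD: soft-lifetime expiry of a used SA moves it to
* DYING and emits SADB_EXPIRE so the IKE daemon can rekey, while
* hard-lifetime expiry moves it straight to DEAD for collection below.)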
*/ key_expire(sav); } #endif /* check HARD lifetime by bytes */ else if (sav->lft_h->bytes != 0 && sav->lft_h->bytes < sav->lft_c->bytes) { key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } /* delete entry in DEAD */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DEAD], chain, nextsav) { /* sanity check */ if (sav->state != SADB_SASTATE_DEAD) { ipseclog((LOG_DEBUG, "%s: invalid sav->state " "(queue: %d SA: %d): kill it anyway\n", __func__, SADB_SASTATE_DEAD, sav->state)); } /* * do not call key_freesav() here. * sav should already be freed, and sav->refcnt * shows other references to sav * (such as from SPD). */ } } SAHTREE_UNLOCK(); } static void key_flush_acq(time_t now) { INIT_VNET_IPSEC(curvnet); struct secacq *acq, *nextacq; /* ACQ tree */ ACQ_LOCK(); for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); if (now - acq->created > V_key_blockacq_lifetime && __LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); } } ACQ_UNLOCK(); } static void key_flush_spacq(time_t now) { INIT_VNET_IPSEC(curvnet); struct secspacq *acq, *nextacq; /* SP ACQ tree */ SPACQ_LOCK(); for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); if (now - acq->created > V_key_blockacq_lifetime && __LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); } } SPACQ_UNLOCK(); } /* * time handler. * scanning SPD and SAD to check status for each entries, * and do to remove or to expire. * XXX: year 2038 problem may remain. */ void key_timehandler(void) { VNET_ITERATOR_DECL(vnet_iter); time_t now = time_second; VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); key_flush_spd(now); key_flush_sad(now); key_flush_acq(now); key_flush_spacq(now); CURVNET_RESTORE(); } #ifndef IPSEC_DEBUG2 /* do exchange to tick time !! */ (void)timeout((void *)key_timehandler, (void *)0, hz); #endif /* IPSEC_DEBUG2 */ } u_long key_random() { u_long value; key_randomfill(&value, sizeof(value)); return value; } void key_randomfill(p, l) void *p; size_t l; { size_t n; u_long v; static int warn = 1; n = 0; n = (size_t)read_random(p, (u_int)l); /* last resort */ while (n < l) { v = random(); bcopy(&v, (u_int8_t *)p + n, l - n < sizeof(v) ? l - n : sizeof(v)); n += sizeof(v); if (warn) { printf("WARNING: pseudo-random number generator " "used for IPsec processing\n"); warn = 0; } } } /* * map SADB_SATYPE_* to IPPROTO_*. * if satype == SADB_SATYPE then satype is mapped to ~0. * OUT: * 0: invalid satype. */ static u_int16_t key_satype2proto(satype) u_int8_t satype; { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_IPCOMP; case SADB_X_SATYPE_TCPSIGNATURE: return IPPROTO_TCP; default: return 0; } /* NOTREACHED */ } /* * map IPPROTO_* to SADB_SATYPE_* * OUT: * 0: invalid protocol type. */ static u_int8_t key_proto2satype(proto) u_int16_t proto; { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_IPCOMP: return SADB_X_SATYPE_IPCOMP; case IPPROTO_TCP: return SADB_X_SATYPE_TCPSIGNATURE; default: return 0; } /* NOTREACHED */ } /* %%% PF_KEY */ /* * SADB_GETSPI processing is to receive * * from the IKMPd, to assign a unique spi value, to hang on the INBOUND * tree with the status of LARVAL, and send * * to the IKMPd. * * IN: mhp: pointer to the pointer to each header. * OUT: NULL if fail. 
* other if success, return pointer to the message to send. */ static int key_getspi(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *newsah; struct secasvar *newsav; u_int8_t proto; u_int32_t spi; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* make sure if port number is zero. */ switch (((struct sockaddr *)(src0 + 1))->sa_family) { case AF_INET: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in *)(src0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in6 *)(src0 + 1))->sin6_port = 0; break; default: ; /*???*/ } switch (((struct sockaddr *)(dst0 + 1))->sa_family) { case AF_INET: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in *)(dst0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in6 *)(dst0 + 1))->sin6_port = 0; break; default: ; /*???*/ } /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* SPI allocation */ spi = key_do_getnewspi((struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE], &saidx); if (spi == 0) return key_senderror(so, m, EINVAL); /* get a SA index */ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA index */ if ((newsah = key_newsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); return key_senderror(so, m, ENOBUFS); } } /* get a new SA */ /* XXX rewrite */ newsav = KEY_NEWSAV(m, mhp, newsah, &error); if (newsav == NULL) { /* XXX don't free new SA index allocated in above. */ return key_senderror(so, m, error); } /* set spi */ newsav->spi = htonl(spi); /* delete the entry in acqtree */ if (mhp->msg->sadb_msg_seq != 0) { struct secacq *acq; if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) != NULL) { /* reset counter in order to deletion by timehandler. 
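* (i.e. created is reset to "now" with count 0 so the entry simply ages
* out: key_flush_acq() reaps it once key_blockacq_lifetime seconds pass.)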
*/ acq->created = time_second; acq->count = 0; } } { struct mbuf *n, *nn; struct sadb_sa *m_sa; struct sadb_msg *newmsg; int off, len; /* create new sadb_msg to reply. */ len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) + PFKEY_ALIGN8(sizeof(struct sadb_sa)); MGETHDR(n, M_DONTWAIT, MT_DATA); if (len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off); m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa)); m_sa->sadb_sa_exttype = SADB_EXT_SA; m_sa->sadb_sa_spi = htonl(spi); off += PFKEY_ALIGN8(sizeof(struct sadb_sa)); IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)", off, len)); n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n->m_next) { m_freem(n); return key_senderror(so, m, ENOBUFS); } if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_sendup_mbuf(so, m, KEY_SENDUP_ONE); } n->m_pkthdr.len = 0; for (nn = n; nn; nn = nn->m_next) n->m_pkthdr.len += nn->m_len; newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_seq = newsav->seq; newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } /* * allocating new SPI * called by key_getspi(). * OUT: * 0: failure. * others: success. */ static u_int32_t key_do_getnewspi(spirange, saidx) struct sadb_spirange *spirange; struct secasindex *saidx; { INIT_VNET_IPSEC(curvnet); u_int32_t newspi; u_int32_t min, max; int count = V_key_spi_trycnt; /* set spi range to allocate */ if (spirange != NULL) { min = spirange->sadb_spirange_min; max = spirange->sadb_spirange_max; } else { min = V_key_spi_minval; max = V_key_spi_maxval; } /* IPCOMP needs 2-byte SPI */ if (saidx->proto == IPPROTO_IPCOMP) { u_int32_t t; if (min >= 0x10000) min = 0xffff; if (max >= 0x10000) max = 0xffff; if (min > max) { t = min; min = max; max = t; } } if (min == max) { if (key_checkspidup(saidx, min) != NULL) { ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n", __func__, min)); return 0; } count--; /* taking one cost. */ newspi = min; } else { /* init SPI */ newspi = 0; /* when requesting to allocate spi ranged */ while (count--) { /* generate pseudo-random SPI value ranged. */ newspi = min + (key_random() % (max - min + 1)); if (key_checkspidup(saidx, newspi) == NULL) break; } if (count == 0 || newspi == 0) { ipseclog((LOG_DEBUG, "%s: to allocate spi is failed.\n", __func__)); return 0; } } /* statistics */ keystat.getspi_count = (keystat.getspi_count + V_key_spi_trycnt - count) / 2; return newspi; } /* * SADB_UPDATE processing * receive * * from the ikmpd, and update a secasvar entry whose status is SADB_SASTATE_LARVAL. * and send * * to the ikmpd. * * m will always be freed. 
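*
* (Editor's illustration, not part of the original file: the bounded
* random-SPI draw performed by key_do_getnewspi() above, restated.)
*/
#if 0	/* illustrative sketch only; range values are examples */
	u_int32_t spi, minspi = 0x100, maxspi = 0x0fffffff;
	int tries = V_key_spi_trycnt;
	do {
		/* uniform-ish draw in [minspi, maxspi]; modulo bias accepted */
		spi = minspi + (key_random() % (maxspi - minspi + 1));
	} while (key_checkspidup(saidx, spi) != NULL && --tries > 0);
#endif
/*
* key_update() proper: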
*/ static int key_update(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav; u_int16_t proto; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) || (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && mhp->ext[SADB_EXT_KEY_AUTH] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } /* XXX boundary checking for other extensions */ sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ if ((sah = key_getsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SA index found.\n", __func__)); return key_senderror(so, m, ENOENT); } /* set spidx if there */ /* XXX rewrite */ error = key_setident(sah, m, mhp); if (error) return key_senderror(so, m, error); /* find a SA with sequence number. 
*/ #ifdef IPSEC_DOSEQCHECK if (mhp->msg->sadb_msg_seq != 0 && (sav = key_getsavbyseq(sah, mhp->msg->sadb_msg_seq)) == NULL) { ipseclog((LOG_DEBUG, "%s: no larval SA with sequence %u " "exists.\n", __func__, mhp->msg->sadb_msg_seq)); return key_senderror(so, m, ENOENT); } #else SAHTREE_LOCK(); sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); SAHTREE_UNLOCK(); if (sav == NULL) { ipseclog((LOG_DEBUG, "%s: no such a SA found (spi:%u)\n", __func__, (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif /* validity check */ if (sav->sah->saidx.proto != proto) { ipseclog((LOG_DEBUG, "%s: protocol mismatched " "(DB=%u param=%u)\n", __func__, sav->sah->saidx.proto, proto)); return key_senderror(so, m, EINVAL); } #ifdef IPSEC_DOSEQCHECK if (sav->spi != sa0->sadb_sa_spi) { ipseclog((LOG_DEBUG, "%s: SPI mismatched (DB:%u param:%u)\n", __func__, (u_int32_t)ntohl(sav->spi), (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif if (sav->pid != mhp->msg->sadb_msg_pid) { ipseclog((LOG_DEBUG, "%s: pid mismatched (DB:%u param:%u)\n", __func__, sav->pid, mhp->msg->sadb_msg_pid)); return key_senderror(so, m, EINVAL); } /* copy sav values */ error = key_setsaval(sav, m, mhp); if (error) { KEY_FREESAV(&sav); return key_senderror(so, m, error); } /* check SA values to be mature. */ if ((mhp->msg->sadb_msg_errno = key_mature(sav)) != 0) { KEY_FREESAV(&sav); return key_senderror(so, m, 0); } { struct mbuf *n; /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * search SAD with sequence for a SA which state is SADB_SASTATE_LARVAL. * only called by key_update(). * OUT: * NULL : not found * others : found, pointer to a SA. */ #ifdef IPSEC_DOSEQCHECK static struct secasvar * key_getsavbyseq(sah, seq) struct secashead *sah; u_int32_t seq; { struct secasvar *sav; u_int state; state = SADB_SASTATE_LARVAL; /* search SAD with sequence number ? */ LIST_FOREACH(sav, &sah->savtree[state], chain) { KEY_CHKSASTATE(state, sav->state, __func__); if (sav->seq == seq) { sa_addref(sav); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s cause refcnt++:%d SA:%p\n", __func__, sav->refcnt, sav)); return sav; } } return NULL; } #endif /* * SADB_ADD processing * add an entry to SA database, when received * * from the ikmpd, * and send * * to the ikmpd. * * IGNORE identity and sensitivity messages. * * m will always be freed. 
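*
* (Editor's illustration, not part of the original file: where UPDATE
* requires an existing larval SA, ADD refuses a duplicate SPI — the
* check at the heart of key_add() below.)
*/
#if 0	/* illustrative sketch only */
	SAHTREE_LOCK();
	sav = key_getsavbyspi(sah, spi);	/* anyone already using this SPI? */
	SAHTREE_UNLOCK();
	if (sav != NULL)
		error = EEXIST;			/* ADD must create, never replace */
#endif
/*
* key_add() proper: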
*/ static int key_add(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *newsah; struct secasvar *newsav; u_int16_t proto; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) || (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && mhp->ext[SADB_EXT_KEY_AUTH] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { /* XXX need more */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA header */ if ((newsah = key_newsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); return key_senderror(so, m, ENOBUFS); } } /* set spidx if there */ /* XXX rewrite */ error = key_setident(newsah, m, mhp); if (error) { return key_senderror(so, m, error); } /* create new SA entry. */ /* We can create a new SA only if the SPI is different. */ SAHTREE_LOCK(); newsav = key_getsavbyspi(newsah, sa0->sadb_sa_spi); SAHTREE_UNLOCK(); if (newsav != NULL) { ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__)); return key_senderror(so, m, EEXIST); } newsav = KEY_NEWSAV(m, mhp, newsah, &error); if (newsav == NULL) { return key_senderror(so, m, error); } /* check SA values to be mature. */ if ((error = key_mature(newsav)) != 0) { KEY_FREESAV(&newsav); return key_senderror(so, m, error); } /* * don't call key_freesav() here, as we would like to keep the SA * in the database on success.
*/ { struct mbuf *n; /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* m is retained */ static int key_setident(sah, m, mhp) struct secashead *sah; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; IPSEC_ASSERT(sah != NULL, ("null secashead")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* don't make buffer if not there */ if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL && mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { sah->idents = NULL; sah->identd = NULL; return 0; } if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL || mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__)); return EINVAL; } idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC]; iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST]; idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC]; iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST]; /* validity check */ if (idsrc->sadb_ident_type != iddst->sadb_ident_type) { ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__)); return EINVAL; } switch (idsrc->sadb_ident_type) { case SADB_IDENTTYPE_PREFIX: case SADB_IDENTTYPE_FQDN: case SADB_IDENTTYPE_USERFQDN: default: /* XXX do nothing */ sah->idents = NULL; sah->identd = NULL; return 0; } /* make structure */ sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT); if (sah->idents == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT); if (sah->identd == NULL) { free(sah->idents, M_IPSEC_MISC); sah->idents = NULL; ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } sah->idents->type = idsrc->sadb_ident_type; sah->idents->id = idsrc->sadb_ident_id; sah->identd->type = iddst->sadb_ident_type; sah->identd->id = iddst->sadb_ident_id; return 0; } /* * m will not be freed on return. * it is caller's responsibility to free the result. */ static struct mbuf * key_getmsgbuf_x1(m, mhp) struct mbuf *m; const struct sadb_msghdr *mhp; { struct mbuf *n; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 9, SADB_EXT_RESERVED, SADB_EXT_SA, SADB_X_EXT_SA2, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST, SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT, SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST); if (!n) return NULL; if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return NULL; } mtod(n, struct sadb_msg *)->sadb_msg_errno = 0; mtod(n, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); return n; } static int key_delete_all __P((struct socket *, struct mbuf *, const struct sadb_msghdr *, u_int16_t)); /* * SADB_DELETE processing * receive * * from the ikmpd, and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * * m will always be freed. 
*/ static int key_delete(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav = NULL; u_int16_t proto; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL) { /* * Caller wants us to delete all non-LARVAL SAs * that match the src/dst. This is used during * IKE INITIAL-CONTACT. */ ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__)); return key_delete_all(so, m, mhp, proto); } else if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. */ sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); if (sav) break; } if (sah == NULL) { SAHTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__)); return key_senderror(so, m, ENOENT); } key_sa_chgstate(sav, SADB_SASTATE_DEAD); SAHTREE_UNLOCK(); KEY_FREESAV(&sav); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED, SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * delete all SAs for src/dst. Called from key_delete(). 
*/ static int key_delete_all(so, m, mhp, proto) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; u_int16_t proto; { INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav, *nextsav; u_int stateidx, state; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* Delete all non-LARVAL SAs. */ for (stateidx = 0; stateidx < _ARRAYLEN(V_saorder_state_alive); stateidx++) { state = V_saorder_state_alive[stateidx]; if (state == SADB_SASTATE_LARVAL) continue; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); /* sanity check */ if (sav->state != state) { ipseclog((LOG_DEBUG, "%s: invalid " "sav->state (queue %d SA %d)\n", __func__, state, sav->state)); continue; } key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } } SAHTREE_UNLOCK(); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_GET processing * receive * * from the ikmpd, and get a SP and a SA to respond, * and send, * * to the ikmpd. * * m will always be freed. 
*/ static int key_get(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav = NULL; u_int16_t proto; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* get a SA header */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. */ sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); if (sav) break; } SAHTREE_UNLOCK(); if (sah == NULL) { ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__)); return key_senderror(so, m, ENOENT); } { struct mbuf *n; u_int8_t satype; /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n", __func__)); return key_senderror(so, m, EINVAL); } /* create new sadb_msg to reply. */ n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq, mhp->msg->sadb_msg_pid); if (!n) return key_senderror(so, m, ENOBUFS); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } /* XXX make it sysctl-configurable? 
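* (Editor's note: the evident intent below is soft = 80% of hard, e.g.
* hard_addtime 86400s -> soft_addtime 69120s; as written, though, the
* soft_addtime and hard_usetime assignments scale the very field being
* assigned — which the caller's bzero() has just cleared — rather than
* the opposite limit, so they come out as 0.)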
*/ static void key_getcomb_setlifetime(comb) struct sadb_comb *comb; { comb->sadb_comb_soft_allocations = 1; comb->sadb_comb_hard_allocations = 1; comb->sadb_comb_soft_bytes = 0; comb->sadb_comb_hard_bytes = 0; comb->sadb_comb_hard_addtime = 86400; /* 1 day */ comb->sadb_comb_soft_addtime = comb->sadb_comb_soft_addtime * 80 / 100; comb->sadb_comb_soft_usetime = 28800; /* 8 hours */ comb->sadb_comb_hard_usetime = comb->sadb_comb_hard_usetime * 80 / 100; } /* * XXX reorder combinations by preference * XXX no idea if the user wants ESP authentication or not */ static struct mbuf * key_getcomb_esp() { INIT_VNET_IPSEC(curvnet); struct sadb_comb *comb; struct enc_xform *algo; struct mbuf *result = NULL, *m, *n; int encmin; int i, off, o; int totlen; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_EALG_MAX; i++) { algo = esp_algorithm_lookup(i); if (algo == NULL) continue; /* discard algorithms with key size smaller than system min */ if (_BITS(algo->maxkey) < V_ipsec_esp_keymin) continue; if (_BITS(algo->minkey) < V_ipsec_esp_keymin) encmin = V_ipsec_esp_keymin; else encmin = _BITS(algo->minkey); if (V_ipsec_esp_auth) m = key_getcomb_ah(); else { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_DONTWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; bzero(mtod(m, caddr_t), m->m_len); } } if (!m) goto fail; totlen = 0; for (n = m; n; n = n->m_next) totlen += n->m_len; IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l)); for (off = 0; off < totlen; off += l) { n = m_pulldown(m, off, l, &o); if (!n) { /* m is already freed */ goto fail; } comb = (struct sadb_comb *)(mtod(n, caddr_t) + o); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; comb->sadb_comb_encrypt_minbits = encmin; comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey); } if (!result) result = m; else m_cat(result, m); } return result; fail: if (result) m_freem(result); return NULL; } static void key_getsizes_ah( const struct auth_hash *ah, int alg, u_int16_t* min, u_int16_t* max) { INIT_VNET_IPSEC(curvnet); *min = *max = ah->keysize; if (ah->keysize == 0) { /* * Transform takes arbitrary key size but algorithm * key size is restricted. Enforce this here. 
*/ switch (alg) { case SADB_X_AALG_MD5: *min = *max = 16; break; case SADB_X_AALG_SHA: *min = *max = 20; break; case SADB_X_AALG_NULL: *min = 1; *max = 256; break; default: DPRINTF(("%s: unknown AH algorithm %u\n", __func__, alg)); break; } } } /* * XXX reorder combinations by preference */ static struct mbuf * key_getcomb_ah() { INIT_VNET_IPSEC(curvnet); struct sadb_comb *comb; struct auth_hash *algo; struct mbuf *m; u_int16_t minkeysize, maxkeysize; int i; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_AALG_MAX; i++) { #if 1 /* we prefer HMAC algorithms, not old algorithms */ if (i != SADB_AALG_SHA1HMAC && i != SADB_AALG_MD5HMAC) continue; #endif algo = ah_algorithm_lookup(i); if (!algo) continue; key_getsizes_ah(algo, i, &minkeysize, &maxkeysize); /* discard algorithms with key size smaller than system min */ if (_BITS(minkeysize) < V_ipsec_ah_keymin) continue; if (!m) { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_DONTWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else M_PREPEND(m, l, M_DONTWAIT); if (!m) return NULL; comb = mtod(m, struct sadb_comb *); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_auth = i; comb->sadb_comb_auth_minbits = _BITS(minkeysize); comb->sadb_comb_auth_maxbits = _BITS(maxkeysize); } return m; } /* * not really official behavior; discussed on pf_key@inner.net in Sep 2000. * XXX reorder combinations by preference */ static struct mbuf * key_getcomb_ipcomp() { struct sadb_comb *comb; struct comp_algo *algo; struct mbuf *m; int i; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_X_CALG_MAX; i++) { algo = ipcomp_algorithm_lookup(i); if (!algo) continue; if (!m) { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_DONTWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else M_PREPEND(m, l, M_DONTWAIT); if (!m) return NULL; comb = mtod(m, struct sadb_comb *); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; /* what should we set into sadb_comb_*_{min,max}bits? */ } return m; } /* * XXX no way to pass mode (transport/tunnel) to userland * XXX replay checking? * XXX sysctl interface to ipsec_{ah,esp}_keymin */ static struct mbuf * key_getprop(saidx) const struct secasindex *saidx; { struct sadb_prop *prop; struct mbuf *m, *n; const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop)); int totlen; switch (saidx->proto) { case IPPROTO_ESP: m = key_getcomb_esp(); break; case IPPROTO_AH: m = key_getcomb_ah(); break; case IPPROTO_IPCOMP: m = key_getcomb_ipcomp(); break; default: return NULL; } if (!m) return NULL; M_PREPEND(m, l, M_DONTWAIT); if (!m) return NULL; totlen = 0; for (n = m; n; n = n->m_next) totlen += n->m_len; prop = mtod(m, struct sadb_prop *); bzero(prop, sizeof(*prop)); prop->sadb_prop_len = PFKEY_UNIT64(totlen); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; prop->sadb_prop_replay = 32; /* XXX */ return m; } /* * SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2(). * send * * to KMD, and expect to receive * with SADB_ACQUIRE if error occurred, * or * with SADB_GETSPI * from KMD by PF_KEY. * * XXX x_policy is outside of RFC2367 (KAME extension). * XXX sensitivity is not supported. * XXX for ipcomp, RFC2367 does not define how to fill in proposal. * see comment for key_getcomb_ipcomp.
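* (Editor's note on throttling: key_getacq() below reuses a pending entry,
* and key_acquire() only lets another SADB_ACQUIRE out after
* key_blockacq_count suppressed attempts, so a burst of outbound traffic
* toward one peer produces a single upcall to the IKE daemon.)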
* * OUT: * 0 : success * others: error number */ static int key_acquire(const struct secasindex *saidx, struct secpolicy *sp) { INIT_VNET_IPSEC(curvnet); struct mbuf *result = NULL, *m; struct secacq *newacq; u_int8_t satype; int error = -1; u_int32_t seq; IPSEC_ASSERT(saidx != NULL, ("null saidx")); satype = key_proto2satype(saidx->proto); IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto)); /* * We never block while acquiring an SA. An alternative design would * have the kernel hold the SADB_ACQUIRE message until it hears back * from the IKE daemon; in that case the pending request would be * managed on an ACQUIRING list. */ /* Look up an existing entry to decide whether to send a message. */ if ((newacq = key_getacq(saidx)) != NULL) { if (V_key_blockacq_count < newacq->count) { /* reset the counter and send the message. */ newacq->count = 0; } else { /* increment the counter and do nothing. */ newacq->count++; return 0; } } else { /* create a new entry to throttle subsequent SADB_ACQUIREs. */ if ((newacq = key_newacq(saidx)) == NULL) return ENOBUFS; } seq = newacq->seq; m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0); if (!m) { error = ENOBUFS; goto fail; } result = m; /* set sadb_address extensions for saidx's addresses. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &saidx->src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &saidx->dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* XXX proxy address (optional) */ /* set sadb_x_policy */ if (sp) { m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); } /* XXX identity (optional) */ #if 0 if (idexttype && fqdn) { /* create identity extension (FQDN) */ struct sadb_ident *id; int fqdnlen; fqdnlen = strlen(fqdn) + 1; /* +1 for terminating-NUL */ id = (struct sadb_ident *)p; bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen)); id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen)); id->sadb_ident_exttype = idexttype; id->sadb_ident_type = SADB_IDENTTYPE_FQDN; bcopy(fqdn, id + 1, fqdnlen); p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen); } if (idexttype) { /* create identity extension (USERFQDN) */ struct sadb_ident *id; int userfqdnlen; if (userfqdn) { /* +1 for terminating-NUL */ userfqdnlen = strlen(userfqdn) + 1; } else userfqdnlen = 0; id = (struct sadb_ident *)p; bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen)); id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen)); id->sadb_ident_exttype = idexttype; id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN; /* XXX is it correct? */ if (curproc && curproc->p_cred) id->sadb_ident_id = curproc->p_cred->p_ruid; if (userfqdn && userfqdnlen) bcopy(userfqdn, id + 1, userfqdnlen); p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen); } #endif /* XXX sensitivity (optional) */ /* create proposal/combination extension */ m = key_getprop(saidx); #if 0 /* * Spec conformant: always attach the proposal/combination extension; * the problem is that we have no way to attach it for IPcomp, * due to the way sadb_comb is declared in RFC 2367. */ if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); #else /* * Outside of spec: make the proposal/combination extension optional.
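 *
 * The throttle above condenses to roughly the following (a
 * restatement for illustration, not separate code): one ACQUIRE goes
 * out, then the next V_key_blockacq_count requests for the same
 * saidx are swallowed while only bumping the counter:
 *
 *	if (acq->count <= V_key_blockacq_count) {
 *		acq->count++;
 *		return 0;		suppressed
 *	}
 *	acq->count = 0;			send again
 *
 * key_timehandler() (not shown in this hunk) eventually retires
 * stale entries after V_key_blockacq_lifetime seconds.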
*/ if (m) m_cat(result, m); #endif if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); return error; } static struct secacq * key_newacq(const struct secasindex *saidx) { INIT_VNET_IPSEC(curvnet); struct secacq *newacq; /* get new entry */ newacq = malloc(sizeof(struct secacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO); if (newacq == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return NULL; } /* copy secindex */ bcopy(saidx, &newacq->saidx, sizeof(newacq->saidx)); newacq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq); newacq->created = time_second; newacq->count = 0; /* add to acqtree */ ACQ_LOCK(); LIST_INSERT_HEAD(&V_acqtree, newacq, chain); ACQ_UNLOCK(); return newacq; } static struct secacq * key_getacq(const struct secasindex *saidx) { INIT_VNET_IPSEC(curvnet); struct secacq *acq; ACQ_LOCK(); LIST_FOREACH(acq, &V_acqtree, chain) { if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY)) break; } ACQ_UNLOCK(); return acq; } static struct secacq * key_getacqbyseq(seq) u_int32_t seq; { INIT_VNET_IPSEC(curvnet); struct secacq *acq; ACQ_LOCK(); LIST_FOREACH(acq, &V_acqtree, chain) { if (acq->seq == seq) break; } ACQ_UNLOCK(); return acq; } static struct secspacq * key_newspacq(spidx) struct secpolicyindex *spidx; { INIT_VNET_IPSEC(curvnet); struct secspacq *acq; /* get new entry */ acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO); if (acq == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return NULL; } /* copy secindex */ bcopy(spidx, &acq->spidx, sizeof(acq->spidx)); acq->created = time_second; acq->count = 0; /* add to spacqtree */ SPACQ_LOCK(); LIST_INSERT_HEAD(&V_spacqtree, acq, chain); SPACQ_UNLOCK(); return acq; } static struct secspacq * key_getspacq(spidx) struct secpolicyindex *spidx; { INIT_VNET_IPSEC(curvnet); struct secspacq *acq; SPACQ_LOCK(); LIST_FOREACH(acq, &V_spacqtree, chain) { if (key_cmpspidx_exactly(spidx, &acq->spidx)) { /* NB: return holding spacq_lock */ return acq; } } SPACQ_UNLOCK(); return NULL; } /* * SADB_ACQUIRE processing, * in first situation, is receiving * * from the ikmpd, and clear sequence of its secasvar entry. * * In second situation, is receiving * * from a user land process, and return * * to the socket. * * m will always be freed. */ static int key_acquire2(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); const struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; u_int16_t proto; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* * Error message from KMd. * We assume that if error was occured in IKEd, the length of PFKEY * message is equal to the size of sadb_msg structure. * We do not raise error even if error occured in this function. 
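 *
 * Note the locking asymmetry above: key_getacq() and
 * key_getacqbyseq() drop ACQ_LOCK before returning, but
 * key_getspacq() returns with SPACQ_LOCK still held on a hit, so its
 * caller is expected to follow the pattern (a sketch):
 *
 *	acq = key_getspacq(spidx);
 *	if (acq != NULL) {
 *		... use acq ...
 *		SPACQ_UNLOCK();		caller releases the lock
 *	}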
*/ if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) { struct secacq *acq; /* check sequence number */ if (mhp->msg->sadb_msg_seq == 0) { ipseclog((LOG_DEBUG, "%s: must specify sequence " "number.\n", __func__)); m_freem(m); return 0; } if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) == NULL) { /* * the specified larval SA is already gone, or we got * a bogus sequence number. we can silently ignore it. */ m_freem(m); return 0; } /* reset acq counter in order to deletion by timehander. */ acq->created = time_second; acq->count = 0; m_freem(m); return 0; } /* * This message is from user land. */ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_EXT_PROPOSAL] == NULL) { /* error */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_PROPOSAL] < sizeof(struct sadb_prop)) { /* error */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* get a SA index */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID)) break; } SAHTREE_UNLOCK(); if (sah != NULL) { ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } error = key_acquire(&saidx, NULL); if (error != 0) { ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n", __func__, mhp->msg->sadb_msg_errno)); return key_senderror(so, m, error); } return key_sendup_mbuf(so, m, KEY_SENDUP_REGISTERED); } /* * SADB_REGISTER processing. * If SATYPE_UNSPEC has been passed as satype, only return sabd_supported. * receive * * from the ikmpd, and register a socket to send PF_KEY messages, * and send * * to KMD by PF_KEY. * If socket is detached, must free from regnode. * * m will always be freed. */ static int key_register(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct secreg *reg, *newreg = 0; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* check for invalid register message */ if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0])) return key_senderror(so, m, EINVAL); /* When SATYPE_UNSPEC is specified, only return sabd_supported. 
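 *
 * For illustration, a key-management daemon registers itself with
 * roughly the following userland code (a hypothetical sketch; s, buf
 * and the error handling are ours, lengths are in 8-byte units):
 *
 *	int s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
 *	struct sadb_msg msg = {
 *		.sadb_msg_version = PF_KEY_V2,
 *		.sadb_msg_type = SADB_REGISTER,
 *		.sadb_msg_satype = SADB_SATYPE_ESP,
 *		.sadb_msg_len = sizeof(msg) / 8,
 *		.sadb_msg_pid = getpid(),
 *	};
 *	write(s, &msg, sizeof(msg));
 *	read(s, buf, sizeof(buf));	reply carries sadb_supported
 *
 * and from then on the socket also receives SADB_ACQUIRE and
 * SADB_EXPIRE traffic (the KEY_SENDUP_REGISTERED target).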
*/ if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC) goto setmsg; /* check whether existing or not */ REGTREE_LOCK(); LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) { if (reg->so == so) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: socket exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } } /* create regnode */ newreg = malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO); if (newreg == NULL) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } newreg->so = so; ((struct keycb *)sotorawcb(so))->kp_registered++; /* add regnode to regtree. */ LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain); REGTREE_UNLOCK(); setmsg: { struct mbuf *n; struct sadb_msg *newmsg; struct sadb_supported *sup; u_int len, alen, elen; int off; int i; struct sadb_alg *alg; /* create new sadb_msg to reply. */ alen = 0; for (i = 1; i <= SADB_AALG_MAX; i++) { if (ah_algorithm_lookup(i)) alen += sizeof(struct sadb_alg); } if (alen) alen += sizeof(struct sadb_supported); elen = 0; for (i = 1; i <= SADB_EALG_MAX; i++) { if (esp_algorithm_lookup(i)) elen += sizeof(struct sadb_alg); } if (elen) elen += sizeof(struct sadb_supported); len = sizeof(struct sadb_msg) + alen + elen; if (len > MCLBYTES) return key_senderror(so, m, ENOBUFS); MGETHDR(n, M_DONTWAIT, MT_DATA); if (len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_pkthdr.len = n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(len); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); /* for authentication algorithm */ if (alen) { sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(alen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; off += PFKEY_ALIGN8(sizeof(*sup)); for (i = 1; i <= SADB_AALG_MAX; i++) { struct auth_hash *aalgo; u_int16_t minkeysize, maxkeysize; aalgo = ah_algorithm_lookup(i); if (!aalgo) continue; alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = 0; key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize); alg->sadb_alg_minbits = _BITS(minkeysize); alg->sadb_alg_maxbits = _BITS(maxkeysize); off += PFKEY_ALIGN8(sizeof(*alg)); } } /* for encryption algorithm */ if (elen) { sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(elen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; off += PFKEY_ALIGN8(sizeof(*sup)); for (i = 1; i <= SADB_EALG_MAX; i++) { struct enc_xform *ealgo; ealgo = esp_algorithm_lookup(i); if (!ealgo) continue; alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = ealgo->blocksize; alg->sadb_alg_minbits = _BITS(ealgo->minkey); alg->sadb_alg_maxbits = _BITS(ealgo->maxkey); off += PFKEY_ALIGN8(sizeof(struct sadb_alg)); } } IPSEC_ASSERT(off == len, ("length assumption failed (off %u len %u)", off, len)); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED); } } /* * free secreg entry registered. * XXX: I want to do free a socket marked done SADB_RESIGER to socket. */ void key_freereg(struct socket *so) { INIT_VNET_IPSEC(curvnet); struct secreg *reg; int i; IPSEC_ASSERT(so != NULL, ("NULL so")); /* * check whether existing or not. 
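 *
 * The SADB_REGISTER reply assembled in key_register() above has the
 * shape (a sketch; all key sizes are reported in bits via _BITS()):
 *
 *	struct sadb_msg
 *	struct sadb_supported   SADB_EXT_SUPPORTED_AUTH
 *	    struct sadb_alg[]   one per AH algorithm
 *	struct sadb_supported   SADB_EXT_SUPPORTED_ENCRYPT
 *	    struct sadb_alg[]   one per ESP algorithm, with IV length
 *
 * The IPSEC_ASSERT(off == len) at the end checks that the sizing
 * pass and the filling pass over the algorithm tables agreed.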
* check all type of SA, because there is a potential that * one socket is registered to multiple type of SA. */ REGTREE_LOCK(); for (i = 0; i <= SADB_SATYPE_MAX; i++) { LIST_FOREACH(reg, &V_regtree[i], chain) { if (reg->so == so && __LIST_CHAINED(reg)) { LIST_REMOVE(reg, chain); free(reg, M_IPSEC_SAR); break; } } } REGTREE_UNLOCK(); } /* * SADB_EXPIRE processing * send * * to KMD by PF_KEY. * NOTE: We send only soft lifetime extension. * * OUT: 0 : succeed * others : error number */ static int key_expire(struct secasvar *sav) { int s; int satype; struct mbuf *result = NULL, *m; int len; int error = -1; struct sadb_lifetime *lt; /* XXX: Why do we lock ? */ s = splnet(); /*called from softclock()*/ IPSEC_ASSERT (sav != NULL, ("null sav")); IPSEC_ASSERT (sav->sah != NULL, ("null sa header")); /* set msg header */ satype = key_proto2satype(sav->sah->saidx.proto); IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype)); m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt); if (!m) { error = ENOBUFS; goto fail; } result = m; /* create SA extension */ m = key_setsadbsa(sav); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* create SA extension */ m = key_setsadbxsa2(sav->sah->saidx.mode, sav->replay ? sav->replay->count : 0, sav->sah->saidx.reqid); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* create lifetime extension (current and soft) */ len = PFKEY_ALIGN8(sizeof(*lt)) * 2; m = key_alloc_mbuf(len); if (!m || m->m_next) { /*XXX*/ if (m) m_freem(m); error = ENOBUFS; goto fail; } bzero(mtod(m, caddr_t), len); lt = mtod(m, struct sadb_lifetime *); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lt->sadb_lifetime_allocations = sav->lft_c->allocations; lt->sadb_lifetime_bytes = sav->lft_c->bytes; lt->sadb_lifetime_addtime = sav->lft_c->addtime; lt->sadb_lifetime_usetime = sav->lft_c->usetime; lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lt->sadb_lifetime_allocations = sav->lft_s->allocations; lt->sadb_lifetime_bytes = sav->lft_s->bytes; lt->sadb_lifetime_addtime = sav->lft_s->addtime; lt->sadb_lifetime_usetime = sav->lft_s->usetime; m_cat(result, m); /* set sadb_address for source */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sav->sah->saidx.src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set sadb_address for destination */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sav->sah->saidx.dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); splx(s); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); splx(s); return error; } /* * SADB_FLUSH processing * receive * * from the ikmpd, and free all entries in secastree. * and send, * * to the ikmpd. * NOTE: to do is only marking SADB_SASTATE_DEAD. * * m will always be freed. 
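 *
 * On key_expire() above: both lifetime extensions share one mbuf of
 * len = 2 * PFKEY_ALIGN8(sizeof(struct sadb_lifetime)); CURRENT sits
 * at offset 0 and SOFT at len / 2 (a condensed sketch):
 *
 *	lt = mtod(m, struct sadb_lifetime *);		CURRENT
 *	lt = (struct sadb_lifetime *)
 *	    (mtod(m, caddr_t) + len / 2);		SOFT
 *
 * which is why a multi-mbuf result from key_alloc_mbuf()
 * (m->m_next != NULL) is rejected: the pointer arithmetic assumes a
 * contiguous buffer.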
*/ static int key_flush(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct sadb_msg *newmsg; struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; u_int16_t proto; u_int8_t state; u_int stateidx; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* no SATYPE specified, i.e. flushing all SA. */ SAHTREE_LOCK(); for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { nextsah = LIST_NEXT(sah, chain); if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; stateidx < _ARRAYLEN(V_saorder_state_alive); stateidx++) { state = V_saorder_state_any[stateidx]; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } sah->state = SADB_SASTATE_DEAD; } SAHTREE_UNLOCK(); if (m->m_len < sizeof(struct sadb_msg) || sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } if (m->m_next) m_freem(m->m_next); m->m_next = NULL; m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg); newmsg = mtod(m, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } /* * SADB_DUMP processing * dump all entries including status of DEAD in SAD. * receive * * from the ikmpd, and dump all secasvar leaves * and send, * ..... * to the ikmpd. * * m will always be freed. */ static int key_dump(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int16_t proto; u_int stateidx; u_int8_t satype; u_int8_t state; int cnt; struct sadb_msg *newmsg; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* count sav entries to be sent to the userland. */ cnt = 0; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; stateidx < _ARRAYLEN(V_saorder_state_any); stateidx++) { state = V_saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { cnt++; } } } if (cnt == 0) { SAHTREE_UNLOCK(); return key_senderror(so, m, ENOENT); } /* send this to the userland, one at a time. 
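 *
 * Each SA is sent as its own PF_KEY message whose sadb_msg_seq takes
 * the pre-decremented countdown --cnt, so the last message carries
 * seq == 0.  A dumping application can therefore read until the
 * counter runs out (a hypothetical userland sketch; s and buf are
 * ours):
 *
 *	do {
 *		read(s, buf, sizeof(buf));
 *		msg = (struct sadb_msg *)buf;
 *		... decode one SADB_DUMP entry ...
 *	} while (msg->sadb_msg_seq != 0);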
*/ newmsg = NULL; LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { SAHTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: there was invalid proto in " "SAD.\n", __func__)); return key_senderror(so, m, EINVAL); } for (stateidx = 0; stateidx < _ARRAYLEN(V_saorder_state_any); stateidx++) { state = V_saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { n = key_setdumpsa(sav, SADB_DUMP, satype, --cnt, mhp->msg->sadb_msg_pid); if (!n) { SAHTREE_UNLOCK(); return key_senderror(so, m, ENOBUFS); } key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } } SAHTREE_UNLOCK(); m_freem(m); return 0; } /* * SADB_X_PROMISC processing * * m will always be freed. */ static int key_promisc(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { int olen; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len); if (olen < sizeof(struct sadb_msg)) { #if 1 return key_senderror(so, m, EINVAL); #else m_freem(m); return 0; #endif } else if (olen == sizeof(struct sadb_msg)) { /* enable/disable promisc mode */ struct keycb *kp; if ((kp = (struct keycb *)sotorawcb(so)) == NULL) return key_senderror(so, m, EINVAL); mhp->msg->sadb_msg_errno = 0; switch (mhp->msg->sadb_msg_satype) { case 0: case 1: kp->kp_promisc = mhp->msg->sadb_msg_satype; break; default: return key_senderror(so, m, EINVAL); } /* send the original message back to everyone */ mhp->msg->sadb_msg_errno = 0; return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } else { /* send packet as is */ m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg))); /* TODO: if sadb_msg_seq is specified, send to specific pid */ return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } } static int (*key_typesw[]) __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)) = { NULL, /* SADB_RESERVED */ key_getspi, /* SADB_GETSPI */ key_update, /* SADB_UPDATE */ key_add, /* SADB_ADD */ key_delete, /* SADB_DELETE */ key_get, /* SADB_GET */ key_acquire2, /* SADB_ACQUIRE */ key_register, /* SADB_REGISTER */ NULL, /* SADB_EXPIRE */ key_flush, /* SADB_FLUSH */ key_dump, /* SADB_DUMP */ key_promisc, /* SADB_X_PROMISC */ NULL, /* SADB_X_PCHANGE */ key_spdadd, /* SADB_X_SPDUPDATE */ key_spdadd, /* SADB_X_SPDADD */ key_spddelete, /* SADB_X_SPDDELETE */ key_spdget, /* SADB_X_SPDGET */ NULL, /* SADB_X_SPDACQUIRE */ key_spddump, /* SADB_X_SPDDUMP */ key_spdflush, /* SADB_X_SPDFLUSH */ key_spdadd, /* SADB_X_SPDSETIDX */ NULL, /* SADB_X_SPDEXPIRE */ key_spddelete2, /* SADB_X_SPDDELETE2 */ }; /* * Parse the sadb_msg buffer to process PF_KEYv2 and create response * data if needed. The message is handled directly as an mbuf chain. * IN: * m : a received mbuf, already pulled up; it is rewritten * into the response. * so : pointer to the socket. * OUT: * length of the buffer to send to the user process.
*/ int key_parse(m, so) struct mbuf *m; struct socket *so; { INIT_VNET_IPSEC(curvnet); struct sadb_msg *msg; struct sadb_msghdr mh; u_int orglen; int error; int target; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); #if 0 /*kdebug_sadb assumes msg in linear buffer*/ KEYDEBUG(KEYDEBUG_KEY_DUMP, ipseclog((LOG_DEBUG, "%s: passed sadb_msg\n", __func__)); kdebug_sadb(msg)); #endif if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) return ENOBUFS; } msg = mtod(m, struct sadb_msg *); orglen = PFKEY_UNUNIT64(msg->sadb_msg_len); target = KEY_SENDUP_ONE; if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != m->m_pkthdr.len) { ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__)); V_pfkeystat.out_invlen++; error = EINVAL; goto senderror; } if (msg->sadb_msg_version != PF_KEY_V2) { ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n", __func__, msg->sadb_msg_version)); V_pfkeystat.out_invver++; error = EINVAL; goto senderror; } if (msg->sadb_msg_type > SADB_MAX) { ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_type)); V_pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; } /* for old-fashioned code - should be nuked */ if (m->m_pkthdr.len > MCLBYTES) { m_freem(m); return ENOBUFS; } if (m->m_next) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_DATA); if (n && m->m_pkthdr.len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) { m_freem(m); return ENOBUFS; } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_pkthdr.len = n->m_len = m->m_pkthdr.len; n->m_next = NULL; m_freem(m); m = n; } /* align the mbuf chain so that extensions are in contiguous region. */ error = key_align(m, &mh); if (error) return error; msg = mh.msg; /* check SA type */ switch (msg->sadb_msg_satype) { case SADB_SATYPE_UNSPEC: switch (msg->sadb_msg_type) { case SADB_GETSPI: case SADB_UPDATE: case SADB_ADD: case SADB_DELETE: case SADB_GET: case SADB_ACQUIRE: case SADB_EXPIRE: ipseclog((LOG_DEBUG, "%s: must specify satype " "when msg type=%u.\n", __func__, msg->sadb_msg_type)); V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } break; case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_IPCOMP: case SADB_X_SATYPE_TCPSIGNATURE: switch (msg->sadb_msg_type) { case SADB_X_SPDADD: case SADB_X_SPDDELETE: case SADB_X_SPDGET: case SADB_X_SPDDUMP: case SADB_X_SPDFLUSH: case SADB_X_SPDSETIDX: case SADB_X_SPDUPDATE: case SADB_X_SPDDELETE2: ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n", __func__, msg->sadb_msg_type)); V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } break; case SADB_SATYPE_RSVP: case SADB_SATYPE_OSPFV2: case SADB_SATYPE_RIPV2: case SADB_SATYPE_MIP: ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n", __func__, msg->sadb_msg_satype)); V_pfkeystat.out_invsatype++; error = EOPNOTSUPP; goto senderror; case 1: /* XXX: What does it do? 
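 *
 * XXX note on the length check near the top of key_parse(): the test
 * "m->m_pkthdr.len != m->m_pkthdr.len" compares the field with
 * itself and can never be true; comparing the actual chain length
 * against the orglen computed just above appears to be the intent,
 * i.e. (a sketch, not what the code currently does):
 *
 *	if ((m->m_flags & M_PKTHDR) == 0 ||
 *	    m->m_pkthdr.len != orglen) ...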
*/ if (msg->sadb_msg_type == SADB_X_PROMISC) break; /*FALLTHROUGH*/ default: ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_satype)); V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } /* check field of upper layer protocol and address family */ if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL && mh.ext[SADB_EXT_ADDRESS_DST] != NULL) { struct sadb_address *src0, *dst0; u_int plen; src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]); /* check upper layer protocol */ if (src0->sadb_address_proto != dst0->sadb_address_proto) { ipseclog((LOG_DEBUG, "%s: upper layer protocol " "mismatched.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } /* check family */ if (PFKEY_ADDR_SADDR(src0)->sa_family != PFKEY_ADDR_SADDR(dst0)->sa_family) { ipseclog((LOG_DEBUG, "%s: address family mismatched.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } if (PFKEY_ADDR_SADDR(src0)->sa_len != PFKEY_ADDR_SADDR(dst0)->sa_len) { ipseclog((LOG_DEBUG, "%s: address struct size " "mismatched.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in)) { V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } break; case AF_INET6: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in6)) { V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } break; default: ipseclog((LOG_DEBUG, "%s: unsupported address family\n", __func__)); V_pfkeystat.out_invaddr++; error = EAFNOSUPPORT; goto senderror; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: plen = 0; /*fool gcc*/ break; } /* check max prefix length */ if (src0->sadb_address_prefixlen > plen || dst0->sadb_address_prefixlen > plen) { ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n", __func__)); V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } /* * prefixlen == 0 is valid because there can be a case when * all addresses are matched. */ } if (msg->sadb_msg_type >= sizeof(key_typesw)/sizeof(key_typesw[0]) || key_typesw[msg->sadb_msg_type] == NULL) { V_pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; } return (*key_typesw[msg->sadb_msg_type])(so, m, &mh); senderror: msg->sadb_msg_errno = error; return key_sendup_mbuf(so, m, target); } static int key_senderror(so, m, code) struct socket *so; struct mbuf *m; int code; { struct sadb_msg *msg; IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg), ("mbuf too small, len %u", m->m_len)); msg = mtod(m, struct sadb_msg *); msg->sadb_msg_errno = code; return key_sendup_mbuf(so, m, KEY_SENDUP_ONE); } /* * set the pointer to each header into message buffer. * m will be freed on error. * XXX larger-than-MCLBYTES extension? 
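 *
 * The walk below effectively turns the flat message into an index
 * over extension types (a condensed sketch of the loop):
 *
 *	for (off = sizeof(struct sadb_msg); off < end; off += extlen) {
 *		ext = extension header at off (via m_pulldown);
 *		extlen = PFKEY_UNUNIT64(ext->sadb_ext_len);
 *		mhp->ext[type] = ext;
 *		mhp->extoff[type] = off;
 *		mhp->extlen[type] = extlen;
 *	}
 *
 * so the per-message handlers can simply test
 * mhp->ext[SADB_EXT_...] != NULL instead of re-walking the chain.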
*/ static int key_align(m, mhp) struct mbuf *m; struct sadb_msghdr *mhp; { INIT_VNET_IPSEC(curvnet); struct mbuf *n; struct sadb_ext *ext; size_t off, end; int extlen; int toff; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg), ("mbuf too small, len %u", m->m_len)); /* initialize */ bzero(mhp, sizeof(*mhp)); mhp->msg = mtod(m, struct sadb_msg *); mhp->ext[0] = (struct sadb_ext *)mhp->msg; /*XXX backward compat */ end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len); extlen = end; /*just in case extlen is not updated*/ for (off = sizeof(struct sadb_msg); off < end; off += extlen) { n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff); if (!n) { /* m is already freed */ return ENOBUFS; } ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); /* set pointer */ switch (ext->sadb_ext_type) { case SADB_EXT_SA: case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_EXT_LIFETIME_CURRENT: case SADB_EXT_LIFETIME_HARD: case SADB_EXT_LIFETIME_SOFT: case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: case SADB_EXT_SENSITIVITY: case SADB_EXT_PROPOSAL: case SADB_EXT_SUPPORTED_AUTH: case SADB_EXT_SUPPORTED_ENCRYPT: case SADB_EXT_SPIRANGE: case SADB_X_EXT_POLICY: case SADB_X_EXT_SA2: /* duplicate check */ /* * XXX Are there duplication payloads of either * KEY_AUTH or KEY_ENCRYPT ? */ if (mhp->ext[ext->sadb_ext_type] != NULL) { ipseclog((LOG_DEBUG, "%s: duplicate ext_type " "%u\n", __func__, ext->sadb_ext_type)); m_freem(m); V_pfkeystat.out_dupext++; return EINVAL; } break; default: ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n", __func__, ext->sadb_ext_type)); m_freem(m); V_pfkeystat.out_invexttype++; return EINVAL; } extlen = PFKEY_UNUNIT64(ext->sadb_ext_len); if (key_validate_ext(ext, extlen)) { m_freem(m); V_pfkeystat.out_invlen++; return EINVAL; } n = m_pulldown(m, off, extlen, &toff); if (!n) { /* m is already freed */ return ENOBUFS; } ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); mhp->ext[ext->sadb_ext_type] = ext; mhp->extoff[ext->sadb_ext_type] = off; mhp->extlen[ext->sadb_ext_type] = extlen; } if (off != end) { m_freem(m); V_pfkeystat.out_invlen++; return EINVAL; } return 0; } static int key_validate_ext(ext, len) const struct sadb_ext *ext; int len; { const struct sockaddr *sa; enum { NONE, ADDR } checktype = NONE; int baselen = 0; const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len); if (len != PFKEY_UNUNIT64(ext->sadb_ext_len)) return EINVAL; /* if it does not match minimum/maximum length, bail */ if (ext->sadb_ext_type >= sizeof(minsize) / sizeof(minsize[0]) || ext->sadb_ext_type >= sizeof(maxsize) / sizeof(maxsize[0])) return EINVAL; if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type]) return EINVAL; if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type]) return EINVAL; /* more checks based on sadb_ext_type XXX need more */ switch (ext->sadb_ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: baselen = PFKEY_ALIGN8(sizeof(struct sadb_address)); checktype = ADDR; break; case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: if (((const struct sadb_ident *)ext)->sadb_ident_type == SADB_X_IDENTTYPE_ADDR) { baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident)); checktype = ADDR; } else checktype = NONE; break; default: checktype = NONE; break; } switch (checktype) { case NONE: break; case ADDR: sa = (const struct sockaddr *)(((const 
u_int8_t*)ext)+baselen); if (len < baselen + sal) return EINVAL; if (baselen + PFKEY_ALIGN8(sa->sa_len) != len) return EINVAL; break; } return 0; } void key_init(void) { INIT_VNET_IPSEC(curvnet); int i; + + V_key_debug_level = 0; + V_key_spi_trycnt = 1000; + V_key_spi_minval = 0x100; + V_key_spi_maxval = 0x0fffffff; /* XXX */ + V_policy_id = 0; + V_key_int_random = 60; /*interval to initialize randseed,1(m)*/ + V_key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ + V_key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ + V_key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ + V_key_preferred_oldsa = 1; /* preferred old sa rather than new sa*/ + + V_acq_seq = 0; + + V_ipsec_esp_keymin = 256; + V_ipsec_esp_auth = 0; + V_ipsec_ah_keymin = 128; SPTREE_LOCK_INIT(); REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); ACQ_LOCK_INIT(); SPACQ_LOCK_INIT(); for (i = 0; i < IPSEC_DIR_MAX; i++) LIST_INIT(&V_sptree[i]); LIST_INIT(&V_sahtree); for (i = 0; i <= SADB_SATYPE_MAX; i++) LIST_INIT(&V_regtree[i]); LIST_INIT(&V_acqtree); LIST_INIT(&V_spacqtree); /* system default */ V_ip4_def_policy.policy = IPSEC_POLICY_NONE; V_ip4_def_policy.refcnt++; /*never reclaim this*/ #ifndef IPSEC_DEBUG2 timeout((void *)key_timehandler, (void *)0, hz); #endif /*IPSEC_DEBUG2*/ /* initialize key statistics */ keystat.getspi_count = 1; printf("IPsec: Initialized Security Association Processing.\n"); return; } /* * XXX: maybe This function is called after INBOUND IPsec processing. * * Special check for tunnel-mode packets. * We must make some checks for consistency between inner and outer IP header. * * xxx more checks to be provided */ int key_checktunnelsanity(sav, family, src, dst) struct secasvar *sav; u_int family; caddr_t src; caddr_t dst; { IPSEC_ASSERT(sav->sah != NULL, ("null SA header")); /* XXX: check inner IP header */ return 1; } /* record data transfer on SA, and update timestamps */ void key_sa_recordxfer(sav, m) struct secasvar *sav; struct mbuf *m; { IPSEC_ASSERT(sav != NULL, ("Null secasvar")); IPSEC_ASSERT(m != NULL, ("Null mbuf")); if (!sav->lft_c) return; /* * XXX Currently, there is a difference of bytes size * between inbound and outbound processing. */ sav->lft_c->bytes += m->m_pkthdr.len; /* to check bytes lifetime is done in key_timehandler(). */ /* * We use the number of packets as the unit of * allocations. We increment the variable * whenever {esp,ah}_{in,out}put is called. */ sav->lft_c->allocations++; /* XXX check for expires? */ /* * NOTE: We record CURRENT usetime by using wall clock, * in seconds. HARD and SOFT lifetime are measured by the time * difference (again in seconds) from usetime. * * usetime * v expire expire * -----+-----+--------+---> t * <--------------> HARD * <-----> SOFT */ sav->lft_c->usetime = time_second; /* XXX check for expires? 
*/ return; } /* dumb version */ void key_sa_routechange(dst) struct sockaddr *dst; { INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct route *ro; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { ro = &sah->sa_route; if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len && bcmp(dst, &ro->ro_dst, dst->sa_len) == 0) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } } SAHTREE_UNLOCK(); } static void key_sa_chgstate(sav, state) struct secasvar *sav; u_int8_t state; { IPSEC_ASSERT(sav != NULL, ("NULL sav")); SAHTREE_LOCK_ASSERT(); if (sav->state != state) { if (__LIST_CHAINED(sav)) LIST_REMOVE(sav, chain); sav->state = state; LIST_INSERT_HEAD(&sav->sah->savtree[state], sav, chain); } } void key_sa_stir_iv(sav) struct secasvar *sav; { IPSEC_ASSERT(sav->iv != NULL, ("null IV")); key_randomfill(sav->iv, sav->ivlen); } /* XXX too much? */ static struct mbuf * key_alloc_mbuf(l) int l; { struct mbuf *m = NULL, *n; int len, t; len = l; while (len > 0) { MGET(n, M_DONTWAIT, MT_DATA); if (n && len > MLEN) MCLGET(n, M_DONTWAIT); if (!n) { m_freem(m); return NULL; } n->m_next = NULL; n->m_len = 0; n->m_len = M_TRAILINGSPACE(n); /* use the bottom of mbuf, hoping we can prepend afterwards */ if (n->m_len > len) { t = (n->m_len - len) & ~(sizeof(long) - 1); n->m_data += t; n->m_len = len; } len -= n->m_len; if (m) m_cat(m, n); else m = n; } return m; } /* * Take one of the kernel's security keys and convert it into a PF_KEY * structure within an mbuf, suitable for sending up to a waiting * application in user land. * * IN: * src: A pointer to a kernel security key. * exttype: Which type of key this is. Refer to the PF_KEY data structures. * OUT: * a valid mbuf or NULL indicating an error * */ static struct mbuf * key_setkey(struct seckey *src, u_int16_t exttype) { struct mbuf *m; struct sadb_key *p; int len; if (src == NULL) return NULL; len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src)); m = key_alloc_mbuf(len); if (m == NULL) return NULL; p = mtod(m, struct sadb_key *); bzero(p, len); p->sadb_key_len = PFKEY_UNIT64(len); p->sadb_key_exttype = exttype; p->sadb_key_bits = src->bits; bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src)); return m; } /* * Take one of the kernel's lifetime data structures and convert it * into a PF_KEY structure within an mbuf, suitable for sending up to * a waiting application in user land. * * IN: * src: A pointer to a kernel lifetime structure. * exttype: Which type of lifetime this is. Refer to the PF_KEY * data structures for more information. * OUT: * a valid mbuf or NULL indicating an error * */ static struct mbuf * key_setlifetime(struct seclifetime *src, u_int16_t exttype) { struct mbuf *m = NULL; struct sadb_lifetime *p; int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime)); if (src == NULL) return NULL; m = key_alloc_mbuf(len); if (m == NULL) return m; p = mtod(m, struct sadb_lifetime *); bzero(p, len); p->sadb_lifetime_len = PFKEY_UNIT64(len); p->sadb_lifetime_exttype = exttype; p->sadb_lifetime_allocations = src->allocations; p->sadb_lifetime_bytes = src->bytes; p->sadb_lifetime_addtime = src->addtime; p->sadb_lifetime_usetime = src->usetime; return m; } Index: head/sys/netipsec/keysock.c =================================================================== --- head/sys/netipsec/keysock.c (revision 185087) +++ head/sys/netipsec/keysock.c (revision 185088) @@ -1,585 +1,589 @@ /* $FreeBSD$ */ /* $KAME: keysock.c,v 1.25 2001/08/13 20:07:41 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ipsec.h" /* This code has derived from sys/net/rtsock.c on FreeBSD2.2.5 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct key_cb { int key_count; int any_count; }; + +#ifdef VIMAGE_GLOBALS static struct key_cb key_cb; +struct pfkeystat pfkeystat; +#endif static struct sockaddr key_src = { 2, PF_KEY, }; static int key_sendup0 __P((struct rawcb *, struct mbuf *, int)); -struct pfkeystat pfkeystat; - /* * key_output() */ int key_output(struct mbuf *m, struct socket *so) { INIT_VNET_IPSEC(curvnet); struct sadb_msg *msg; int len, error = 0; if (m == 0) panic("%s: NULL pointer was passed.\n", __func__); V_pfkeystat.out_total++; V_pfkeystat.out_bytes += m->m_pkthdr.len; len = m->m_pkthdr.len; if (len < sizeof(struct sadb_msg)) { V_pfkeystat.out_tooshort++; error = EINVAL; goto end; } if (m->m_len < sizeof(struct sadb_msg)) { if ((m = m_pullup(m, sizeof(struct sadb_msg))) == 0) { V_pfkeystat.out_nomem++; error = ENOBUFS; goto end; } } M_ASSERTPKTHDR(m); KEYDEBUG(KEYDEBUG_KEY_DUMP, kdebug_mbuf(m)); msg = mtod(m, struct sadb_msg *); V_pfkeystat.out_msgtype[msg->sadb_msg_type]++; if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) { V_pfkeystat.out_invlen++; error = EINVAL; goto end; } error = key_parse(m, so); m = NULL; end: if (m) m_freem(m); return error; } /* * send message to the socket. 
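 *
 * For a promiscuous listener the original message is wrapped below
 * in an outer SADB_X_PROMISC header, so the delivered packet looks
 * like:
 *
 *	struct sadb_msg		SADB_X_PROMISC, len = whole packet
 *	struct sadb_msg		the original message header
 *	...			the original extensions
 *
 * M_PREPEND() makes room for the extra sizeof(struct sadb_msg) bytes
 * and m_pkthdr.len is grown to match before the header is filled in.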
*/ static int key_sendup0(rp, m, promisc) struct rawcb *rp; struct mbuf *m; int promisc; { INIT_VNET_IPSEC(curvnet); int error; if (promisc) { struct sadb_msg *pmsg; M_PREPEND(m, sizeof(struct sadb_msg), M_DONTWAIT); if (m && m->m_len < sizeof(struct sadb_msg)) m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) { V_pfkeystat.in_nomem++; m_freem(m); return ENOBUFS; } m->m_pkthdr.len += sizeof(*pmsg); pmsg = mtod(m, struct sadb_msg *); bzero(pmsg, sizeof(*pmsg)); pmsg->sadb_msg_version = PF_KEY_V2; pmsg->sadb_msg_type = SADB_X_PROMISC; pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); /* pid and seq? */ V_pfkeystat.in_msgtype[pmsg->sadb_msg_type]++; } if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&V_key_src, m, NULL)) { V_pfkeystat.in_nomem++; m_freem(m); error = ENOBUFS; } else error = 0; sorwakeup(rp->rcb_socket); return error; } /* XXX this interface should be obsoleted. */ int key_sendup(so, msg, len, target) struct socket *so; struct sadb_msg *msg; u_int len; int target; /*target of the resulting message*/ { INIT_VNET_IPSEC(curvnet); struct mbuf *m, *n, *mprev; int tlen; /* sanity check */ if (so == 0 || msg == 0) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_KEY_DUMP, printf("%s: \n", __func__); kdebug_sadb(msg)); /* * we increment statistics here, just in case we have ENOBUFS * in this function. */ V_pfkeystat.in_total++; V_pfkeystat.in_bytes += len; V_pfkeystat.in_msgtype[msg->sadb_msg_type]++; /* * Get mbuf chain whenever possible (not clusters), * to save socket buffer. We'll be generating many SADB_ACQUIRE * messages to listening key sockets. If we simply allocate clusters, * sbappendaddr() will raise ENOBUFS due to too little sbspace(). * sbspace() computes # of actual data bytes AND mbuf region. * * TODO: SADB_ACQUIRE filters should be implemented. */ tlen = len; m = mprev = NULL; while (tlen > 0) { if (tlen == len) { MGETHDR(n, M_DONTWAIT, MT_DATA); if (n == NULL) { V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MHLEN; } else { MGET(n, M_DONTWAIT, MT_DATA); if (n == NULL) { V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MLEN; } if (tlen >= MCLBYTES) { /*XXX better threshold? 
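 *
 * The cluster avoidance above matters because sbspace() accounting
 * charges the full storage of every mbuf, not just the bytes used.
 * With typical values (MSIZE = 256, MCLBYTES = 2048), a short
 * message costs:
 *
 *	plain mbuf:	MSIZE			=  256 bytes
 *	cluster mbuf:	MSIZE + MCLBYTES	= 2304 bytes
 *
 * so broadcasting many small SADB_ACQUIREs in clusters would exhaust
 * a listener's receive buffer roughly nine times sooner.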
*/ MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); m_freem(m); V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MCLBYTES; } if (tlen < n->m_len) n->m_len = tlen; n->m_next = NULL; if (m == NULL) m = mprev = n; else { mprev->m_next = n; mprev = n; } tlen -= n->m_len; n = NULL; } m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m_copyback(m, 0, len, (caddr_t)msg); /* avoid duplicated statistics */ V_pfkeystat.in_total--; V_pfkeystat.in_bytes -= len; V_pfkeystat.in_msgtype[msg->sadb_msg_type]--; return key_sendup_mbuf(so, m, target); } /* so can be NULL if target != KEY_SENDUP_ONE */ int key_sendup_mbuf(so, m, target) struct socket *so; struct mbuf *m; int target; { INIT_VNET_NET(curvnet); INIT_VNET_IPSEC(curvnet); struct mbuf *n; struct keycb *kp; int sendup; struct rawcb *rp; int error = 0; if (m == NULL) panic("key_sendup_mbuf: NULL pointer was passed.\n"); if (so == NULL && target == KEY_SENDUP_ONE) panic("%s: NULL pointer was passed.\n", __func__); V_pfkeystat.in_total++; V_pfkeystat.in_bytes += m->m_pkthdr.len; if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (m == NULL) { V_pfkeystat.in_nomem++; return ENOBUFS; } } if (m->m_len >= sizeof(struct sadb_msg)) { struct sadb_msg *msg; msg = mtod(m, struct sadb_msg *); V_pfkeystat.in_msgtype[msg->sadb_msg_type]++; } mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) continue; if (rp->rcb_proto.sp_protocol && rp->rcb_proto.sp_protocol != PF_KEY_V2) { continue; } kp = (struct keycb *)rp; /* * If you are in promiscuous mode, and when you get broadcasted * reply, you'll get two PF_KEY messages. * (based on pf_key@inner.net message on 14 Oct 1998) */ if (((struct keycb *)rp)->kp_promisc) { if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { (void)key_sendup0(rp, n, 1); n = NULL; } } /* the exact target will be processed later */ if (so && sotorawcb(so) == rp) continue; sendup = 0; switch (target) { case KEY_SENDUP_ONE: /* the statement has no effect */ if (so && sotorawcb(so) == rp) sendup++; break; case KEY_SENDUP_ALL: sendup++; break; case KEY_SENDUP_REGISTERED: if (kp->kp_registered) sendup++; break; } V_pfkeystat.in_msgtarget[target]++; if (!sendup) continue; if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { m_freem(m); V_pfkeystat.in_nomem++; mtx_unlock(&rawcb_mtx); return ENOBUFS; } if ((error = key_sendup0(rp, n, 0)) != 0) { m_freem(m); mtx_unlock(&rawcb_mtx); return error; } n = NULL; } if (so) { error = key_sendup0(sotorawcb(so), m, 0); m = NULL; } else { error = 0; m_freem(m); } mtx_unlock(&rawcb_mtx); return error; } /* * key_abort() * derived from net/rtsock.c:rts_abort() */ static void key_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } /* * key_attach() * derived from net/rtsock.c:rts_attach() */ static int key_attach(struct socket *so, int proto, struct thread *td) { INIT_VNET_IPSEC(curvnet); struct keycb *kp; int error; KASSERT(so->so_pcb == NULL, ("key_attach: so_pcb != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NET_RAW); if (error) return error; } /* XXX */ kp = malloc(sizeof *kp, M_PCB, M_WAITOK | M_ZERO); if (kp == 0) return ENOBUFS; so->so_pcb = (caddr_t)kp; error = raw_attach(so, proto); kp = (struct keycb *)sotorawcb(so); if (error) { free(kp, M_PCB); so->so_pcb = (caddr_t) 0; return error; } kp->kp_promisc = kp->kp_registered = 0; if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ V_key_cb.key_count++; V_key_cb.any_count++; soisconnected(so); so->so_options |= SO_USELOOPBACK; return 
0; } /* * key_bind() * derived from net/rtsock.c:rts_bind() */ static int key_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return EINVAL; } /* * key_close() * derived from net/rtsock.c:rts_close(). */ static void key_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* * key_connect() * derived from net/rtsock.c:rts_connect() */ static int key_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return EINVAL; } /* * key_detach() * derived from net/rtsock.c:rts_detach() */ static void key_detach(struct socket *so) { INIT_VNET_IPSEC(curvnet); struct keycb *kp = (struct keycb *)sotorawcb(so); KASSERT(kp != NULL, ("key_detach: kp == NULL")); if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ V_key_cb.key_count--; V_key_cb.any_count--; key_freereg(so); raw_usrreqs.pru_detach(so); } /* * key_disconnect() * derived from net/rtsock.c:key_disconnect() */ static int key_disconnect(struct socket *so) { return(raw_usrreqs.pru_disconnect(so)); } /* * key_peeraddr() * derived from net/rtsock.c:rts_peeraddr() */ static int key_peeraddr(struct socket *so, struct sockaddr **nam) { return(raw_usrreqs.pru_peeraddr(so, nam)); } /* * key_send() * derived from net/rtsock.c:rts_send() */ static int key_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return(raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* * key_shutdown() * derived from net/rtsock.c:rts_shutdown() */ static int key_shutdown(struct socket *so) { return(raw_usrreqs.pru_shutdown(so)); } /* * key_sockaddr() * derived from net/rtsock.c:rts_sockaddr() */ static int key_sockaddr(struct socket *so, struct sockaddr **nam) { return(raw_usrreqs.pru_sockaddr(so, nam)); } struct pr_usrreqs key_usrreqs = { .pru_abort = key_abort, .pru_attach = key_attach, .pru_bind = key_bind, .pru_connect = key_connect, .pru_detach = key_detach, .pru_disconnect = key_disconnect, .pru_peeraddr = key_peeraddr, .pru_send = key_send, .pru_shutdown = key_shutdown, .pru_sockaddr = key_sockaddr, .pru_close = key_close, }; /* sysctl */ SYSCTL_NODE(_net, PF_KEY, key, CTLFLAG_RW, 0, "Key Family"); /* * Definitions of protocols supported in the KEY domain. */ extern struct domain keydomain; struct protosw keysw[] = { { .pr_type = SOCK_RAW, .pr_domain = &keydomain, .pr_protocol = PF_KEY_V2, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = key_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &key_usrreqs } }; static void key_init0(void) { INIT_VNET_IPSEC(curvnet); + bzero((caddr_t)&V_key_cb, sizeof(V_key_cb)); + ipsec_init(); key_init(); } struct domain keydomain = { .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init0, .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[sizeof(keysw)/sizeof(keysw[0])] }; DOMAIN_SET(key); Index: head/sys/netipsec/xform_ah.c =================================================================== --- head/sys/netipsec/xform_ah.c (revision 185087) +++ head/sys/netipsec/xform_ah.c (revision 185088) @@ -1,1222 +1,1228 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ah.c,v 1.63 2001/06/26 06:18:58 angelos Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. 
* * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist. * * Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 1999 Niklas Hallqvist. * Copyright (c) 2001 Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include /* * Return header size in bytes. The old protocol did not support * the replay counter; the new protocol always includes the counter. */ #define HDRSIZE(sav) \ (((sav)->flags & SADB_X_EXT_OLD) ? \ sizeof (struct ah) : sizeof (struct ah) + sizeof (u_int32_t)) /* * Return authenticator size in bytes. The old protocol is known * to use a fixed 16-byte authenticator. The new algorithm use 12-byte * authenticator. */ #define AUTHSIZE(sav) \ ((sav->flags & SADB_X_EXT_OLD) ? 16 : AH_HMAC_HASHLEN) -int ah_enable = 1; /* control flow of packets with AH */ -int ah_cleartos = 1; /* clear ip_tos when doing AH calc */ +#ifdef VIMAGE_GLOBALS +int ah_enable; +int ah_cleartos; struct ahstat ahstat; +#endif SYSCTL_DECL(_net_inet_ah); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO, ah_enable, CTLFLAG_RW, ah_enable, 0, ""); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO, ah_cleartos, CTLFLAG_RW, ah_cleartos, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ah, IPSECCTL_STATS, stats, CTLFLAG_RD, ahstat, ahstat, ""); static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */ static int ah_input_cb(struct cryptop*); static int ah_output_cb(struct cryptop*); /* * NB: this is public for use by the PF_KEY support. 
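 *
 * Worked example of the two macros above: struct ah is 8 bytes, so a
 * new-format (RFC 2402) HMAC SA gets
 *
 *	HDRSIZE  = 8 + 4 (replay counter)	= 12 bytes
 *	AUTHSIZE = AH_HMAC_HASHLEN		= 12 bytes (HMAC-96)
 *
 * while an old-format (RFC 1826) SA gets HDRSIZE = 8 and a full
 * 16-byte keyed digest; either way the complete AH comes to 24
 * bytes, and ah_hdrsiz() below rounds AUTHSIZE up to a 4-byte
 * multiple before adding HDRSIZE.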
*/ struct auth_hash * ah_algorithm_lookup(int alg) { if (alg > SADB_AALG_MAX) return NULL; switch (alg) { case SADB_X_AALG_NULL: return &auth_hash_null; case SADB_AALG_MD5HMAC: return &auth_hash_hmac_md5; case SADB_AALG_SHA1HMAC: return &auth_hash_hmac_sha1; case SADB_X_AALG_RIPEMD160HMAC: return &auth_hash_hmac_ripemd_160; case SADB_X_AALG_MD5: return &auth_hash_key_md5; case SADB_X_AALG_SHA: return &auth_hash_key_sha1; case SADB_X_AALG_SHA2_256: return &auth_hash_hmac_sha2_256; case SADB_X_AALG_SHA2_384: return &auth_hash_hmac_sha2_384; case SADB_X_AALG_SHA2_512: return &auth_hash_hmac_sha2_512; } return NULL; } size_t ah_hdrsiz(struct secasvar *sav) { size_t size; if (sav != NULL) { int authsize; IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null xform")); /*XXX not right for null algorithm--does it matter??*/ authsize = AUTHSIZE(sav); size = roundup(authsize, sizeof (u_int32_t)) + HDRSIZE(sav); } else { /* default guess */ size = sizeof (struct ah) + sizeof (u_int32_t) + 16; } return size; } /* * NB: public for use by esp_init. */ int ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria) { INIT_VNET_IPSEC(curvnet); struct auth_hash *thash; int keylen; thash = ah_algorithm_lookup(sav->alg_auth); if (thash == NULL) { DPRINTF(("%s: unsupported authentication algorithm %u\n", __func__, sav->alg_auth)); return EINVAL; } /* * Verify the replay state block allocation is consistent with * the protocol type. We check here so we can make assumptions * later during protocol processing. */ /* NB: replay state is setup elsewhere (sigh) */ if (((sav->flags&SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)) { DPRINTF(("%s: replay state block inconsistency, " "%s algorithm %s replay state\n", __func__, (sav->flags & SADB_X_EXT_OLD) ? "old" : "new", sav->replay == NULL ? "without" : "with")); return EINVAL; } if (sav->key_auth == NULL) { DPRINTF(("%s: no authentication key for %s algorithm\n", __func__, thash->name)); return EINVAL; } keylen = _KEYLEN(sav->key_auth); if (keylen != thash->keysize && thash->keysize != 0) { DPRINTF(("%s: invalid keylength %d, algorithm %s requires " "keysize %d\n", __func__, keylen, thash->name, thash->keysize)); return EINVAL; } sav->tdb_xform = xsp; sav->tdb_authalgxform = thash; /* Initialize crypto session. */ bzero(cria, sizeof (*cria)); cria->cri_alg = sav->tdb_authalgxform->type; cria->cri_klen = _KEYBITS(sav->key_auth); cria->cri_key = sav->key_auth->key_data; cria->cri_mlen = AUTHSIZE(sav); return 0; } /* * ah_init() is called when an SPI is being set up. */ static int ah_init(struct secasvar *sav, struct xformsw *xsp) { INIT_VNET_IPSEC(curvnet); struct cryptoini cria; int error; error = ah_init0(sav, xsp, &cria); return error ? error : crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support); } /* * Paranoia. * * NB: public for use by esp_zeroize (XXX). */ int ah_zeroize(struct secasvar *sav) { int err; if (sav->key_auth) bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth)); err = crypto_freesession(sav->tdb_cryptoid); sav->tdb_cryptoid = 0; sav->tdb_authalgxform = NULL; sav->tdb_xform = NULL; return err; } /* * Massage IPv4/IPv6 headers for AH processing. 
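 *
 * On the replay-state check in ah_init0() above: the expression
 *
 *	((sav->flags & SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)
 *
 * is true exactly when the two sides disagree: a new-format SA must
 * carry a replay state block (it has a sequence number to check) and
 * an old-format SA must not, since the old protocol has no such
 * field.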
*/ static int ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out) { INIT_VNET_IPSEC(curvnet); struct mbuf *m = *m0; unsigned char *ptr; int off, count; #ifdef INET struct ip *ip; #endif /* INET */ #ifdef INET6 struct ip6_ext *ip6e; struct ip6_hdr ip6; int alloc, len, ad; #endif /* INET6 */ switch (proto) { #ifdef INET case AF_INET: /* * This is the least painful way of dealing with IPv4 header * and option processing -- just make sure they're in * contiguous memory. */ *m0 = m = m_pullup(m, skip); if (m == NULL) { DPRINTF(("%s: m_pullup failed\n", __func__)); return ENOBUFS; } /* Fix the IP header */ ip = mtod(m, struct ip *); if (V_ah_cleartos) ip->ip_tos = 0; ip->ip_ttl = 0; ip->ip_sum = 0; /* * On input, fix ip_len which has been byte-swapped * at ip_input(). */ if (!out) { ip->ip_len = htons(ip->ip_len + skip); if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK) ip->ip_off = htons(ip->ip_off & IP_DF); else ip->ip_off = 0; } else { if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK) ip->ip_off = htons(ntohs(ip->ip_off) & IP_DF); else ip->ip_off = 0; } ptr = mtod(m, unsigned char *) + sizeof(struct ip); /* IPv4 option processing */ for (off = sizeof(struct ip); off < skip;) { if (ptr[off] == IPOPT_EOL || ptr[off] == IPOPT_NOP || off + 1 < skip) ; else { DPRINTF(("%s: illegal IPv4 option length for " "option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } switch (ptr[off]) { case IPOPT_EOL: off = skip; /* End the loop. */ break; case IPOPT_NOP: off++; break; case IPOPT_SECURITY: /* 0x82 */ case 0x85: /* Extended security. */ case 0x86: /* Commercial security. */ case 0x94: /* Router alert */ case 0x95: /* RFC1770 */ /* Sanity check for option length. */ if (ptr[off + 1] < 2) { DPRINTF(("%s: illegal IPv4 option " "length for option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } off += ptr[off + 1]; break; case IPOPT_LSRR: case IPOPT_SSRR: /* Sanity check for option length. */ if (ptr[off + 1] < 2) { DPRINTF(("%s: illegal IPv4 option " "length for option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } /* * On output, if we have either of the * source routing options, we should * swap the destination address of the * IP header with the last address * specified in the option, as that is * what the destination's IP header * will look like. */ if (out) bcopy(ptr + off + ptr[off + 1] - sizeof(struct in_addr), &(ip->ip_dst), sizeof(struct in_addr)); /* Fall through */ default: /* Sanity check for option length. */ if (ptr[off + 1] < 2) { DPRINTF(("%s: illegal IPv4 option " "length for option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } /* Zeroize all other options. */ count = ptr[off + 1]; bcopy(ipseczeroes, ptr, count); off += count; break; } /* Sanity check. */ if (off > skip) { DPRINTF(("%s: malformed IPv4 options header\n", __func__)); m_freem(m); return EINVAL; } } break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* Ugly... */ /* Copy and "cook" the IPv6 header. */ m_copydata(m, 0, sizeof(ip6), (caddr_t) &ip6); /* We don't do IPv6 Jumbograms. */ if (ip6.ip6_plen == 0) { DPRINTF(("%s: unsupported IPv6 jumbogram\n", __func__)); m_freem(m); return EMSGSIZE; } ip6.ip6_flow = 0; ip6.ip6_hlim = 0; ip6.ip6_vfc &= ~IPV6_VERSION_MASK; ip6.ip6_vfc |= IPV6_VERSION; /* Scoped address handling. */ if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_src)) ip6.ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_dst)) ip6.ip6_dst.s6_addr16[1] = 0; /* Done with IPv6 header. 
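 *
 * The header "cooking" above zeroes the fields RFC 2402 treats as
 * mutable before the ICV is computed over the packet:
 *
 *	IPv4:	ip_tos (if ah_cleartos), ip_ttl, ip_sum, and ip_off
 *		(except that the KPDK transforms preserve the DF bit);
 *		mutable IP options are overwritten with zeros
 *	IPv6:	flow label and hop limit, plus the scope words KAME
 *		embeds in link-local addresses, which exist only
 *		in-kernel and never appear on the wire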
*/ m_copyback(m, 0, sizeof(struct ip6_hdr), (caddr_t) &ip6); /* Let's deal with the remaining headers (if any). */ if (skip - sizeof(struct ip6_hdr) > 0) { if (m->m_len <= skip) { ptr = (unsigned char *) malloc( skip - sizeof(struct ip6_hdr), M_XDATA, M_NOWAIT); if (ptr == NULL) { DPRINTF(("%s: failed to allocate memory" "for IPv6 headers\n",__func__)); m_freem(m); return ENOBUFS; } /* * Copy all the protocol headers after * the IPv6 header. */ m_copydata(m, sizeof(struct ip6_hdr), skip - sizeof(struct ip6_hdr), ptr); alloc = 1; } else { /* No need to allocate memory. */ ptr = mtod(m, unsigned char *) + sizeof(struct ip6_hdr); alloc = 0; } } else break; off = ip6.ip6_nxt & 0xff; /* Next header type. */ for (len = 0; len < skip - sizeof(struct ip6_hdr);) switch (off) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: ip6e = (struct ip6_ext *) (ptr + len); /* * Process the mutable/immutable * options -- borrows heavily from the * KAME code. */ for (count = len + sizeof(struct ip6_ext); count < len + ((ip6e->ip6e_len + 1) << 3);) { if (ptr[count] == IP6OPT_PAD1) { count++; continue; /* Skip padding. */ } /* Sanity check. */ if (count > len + ((ip6e->ip6e_len + 1) << 3)) { m_freem(m); /* Free, if we allocated. */ if (alloc) free(ptr, M_XDATA); return EINVAL; } ad = ptr[count + 1]; /* If mutable option, zeroize. */ if (ptr[count] & IP6OPT_MUTABLE) bcopy(ipseczeroes, ptr + count, ptr[count + 1]); count += ad; /* Sanity check. */ if (count > skip - sizeof(struct ip6_hdr)) { m_freem(m); /* Free, if we allocated. */ if (alloc) free(ptr, M_XDATA); return EINVAL; } } /* Advance. */ len += ((ip6e->ip6e_len + 1) << 3); off = ip6e->ip6e_nxt; break; case IPPROTO_ROUTING: /* * Always include routing headers in * computation. */ ip6e = (struct ip6_ext *) (ptr + len); len += ((ip6e->ip6e_len + 1) << 3); off = ip6e->ip6e_nxt; break; default: DPRINTF(("%s: unexpected IPv6 header type %d", __func__, off)); if (alloc) free(ptr, M_XDATA); m_freem(m); return EINVAL; } /* Copyback and free, if we allocated. */ if (alloc) { m_copyback(m, sizeof(struct ip6_hdr), skip - sizeof(struct ip6_hdr), ptr); free(ptr, M_XDATA); } break; #endif /* INET6 */ } return 0; } /* * ah_input() gets called to verify that an input packet * passes authentication. */ static int ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { INIT_VNET_IPSEC(curvnet); struct auth_hash *ahx; struct tdb_ident *tdbi; struct tdb_crypto *tc; struct m_tag *mtag; struct newah *ah; int hl, rplen, authsize; struct cryptodesc *crda; struct cryptop *crp; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->key_auth != NULL, ("null authentication key")); IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null authentication xform")); /* Figure out header size. */ rplen = HDRSIZE(sav); /* XXX don't pullup, just copy header */ IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen); if (ah == NULL) { DPRINTF(("ah_input: cannot pullup header\n")); V_ahstat.ahs_hdrops++; /*XXX*/ m_freem(m); return ENOBUFS; } /* Check replay window, if applicable. */ if (sav->replay && !ipsec_chkreplay(ntohl(ah->ah_seq), sav)) { V_ahstat.ahs_replay++; DPRINTF(("%s: packet replay failure: %s\n", __func__, ipsec_logsastr(sav))); m_freem(m); return ENOBUFS; } /* Verify AH header length. 
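 *
 * Worked example with illustrative numbers: for HMAC-SHA1-96 on a
 * new-format SA, authsize = 12 and rplen = sizeof(struct newah) = 12,
 * so the check below expects
 *
 *	hl = authsize + rplen - sizeof(struct ah) = 12 + 12 - 8 = 16
 *
 * i.e. an ah_len field of 16 / 4 = 4 32-bit words.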
*/ hl = ah->ah_len * sizeof (u_int32_t); ahx = sav->tdb_authalgxform; authsize = AUTHSIZE(sav); if (hl != authsize + rplen - sizeof (struct ah)) { DPRINTF(("%s: bad authenticator length %u (expecting %lu)" " for packet in SA %s/%08lx\n", __func__, hl, (u_long) (authsize + rplen - sizeof (struct ah)), ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_badauthl++; m_freem(m); return EACCES; } V_ahstat.ahs_ibytes += m->m_pkthdr.len - skip - hl; /* Get crypto descriptors. */ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); V_ahstat.ahs_crypto++; m_freem(m); return ENOBUFS; } crda = crp->crp_desc; IPSEC_ASSERT(crda != NULL, ("null crypto descriptor")); crda->crd_skip = 0; crda->crd_len = m->m_pkthdr.len; crda->crd_inject = skip + rplen; /* Authentication operation. */ crda->crd_alg = ahx->type; crda->crd_klen = _KEYBITS(sav->key_auth); crda->crd_key = sav->key_auth->key_data; /* Find out if we've already done crypto. */ for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL); mtag != NULL; mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) { tdbi = (struct tdb_ident *) (mtag + 1); if (tdbi->proto == sav->sah->saidx.proto && tdbi->spi == sav->spi && !bcmp(&tdbi->dst, &sav->sah->saidx.dst, sizeof (union sockaddr_union))) break; } /* Allocate IPsec-specific opaque crypto info. */ if (mtag == NULL) { tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto) + skip + rplen + authsize, M_XDATA, M_NOWAIT|M_ZERO); } else { /* Hash verification has already been done successfully. */ tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); } if (tc == NULL) { DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); V_ahstat.ahs_crypto++; crypto_freereq(crp); m_freem(m); return ENOBUFS; } /* Only save information if crypto processing is needed. */ if (mtag == NULL) { int error; /* * Save the authenticator, the skipped portion of the packet, * and the AH header. */ m_copydata(m, 0, skip + rplen + authsize, (caddr_t)(tc+1)); /* Zeroize the authenticator on the packet. */ m_copyback(m, skip + rplen, authsize, ipseczeroes); /* "Massage" the packet headers for crypto processing. */ error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family, skip, ahx->type, 0); if (error != 0) { /* NB: mbuf is free'd by ah_massage_headers */ V_ahstat.ahs_hdrops++; free(tc, M_XDATA); crypto_freereq(crp); return error; } } /* Crypto operation descriptor. */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ah_input_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback. */ tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_nxt = ah->ah_nxt; tc->tc_protoff = protoff; tc->tc_skip = skip; tc->tc_ptr = (caddr_t) mtag; /* Save the mtag we've identified. 
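 *
 * Layout note for the allocation above: the skipped network headers,
 * the AH header and the authenticator are stashed directly behind the
 * tdb_crypto itself, so the callback can verify the ICV and restore
 * the uncooked headers without reparsing the mbuf:
 *
 *	+------------+---------------+----+-----+
 *	| tdb_crypto | skipped hdrs  | AH | ICV |
 *	+------------+---------------+----+-----+
 *	^ tc         ^ (caddr_t)(tc + 1)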
*/ if (mtag == NULL) return crypto_dispatch(crp); else return ah_input_cb(crp); } #ifdef INET6 #define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) do { \ if (saidx->dst.sa.sa_family == AF_INET6) { \ error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); \ } else { \ error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); \ } \ } while (0) #else #define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) \ (error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag)) #endif /* * AH input callback from the crypto driver. */ static int ah_input_cb(struct cryptop *crp) { INIT_VNET_IPSEC(curvnet); int rplen, error, skip, protoff; unsigned char calc[AH_ALEN_MAX]; struct mbuf *m; struct cryptodesc *crd; struct auth_hash *ahx; struct tdb_crypto *tc; struct m_tag *mtag; struct secasvar *sav; struct secasindex *saidx; u_int8_t nxt; caddr_t ptr; int authsize; crd = crp->crp_desc; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!")); skip = tc->tc_skip; nxt = tc->tc_nxt; protoff = tc->tc_protoff; mtag = (struct m_tag *) tc->tc_ptr; m = (struct mbuf *) crp->crp_buf; sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { V_ahstat.ahs_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; } saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); ahx = (struct auth_hash *) sav->tdb_authalgxform; /* Check for crypto errors. */ if (crp->crp_etype) { if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { error = crypto_dispatch(crp); return error; } V_ahstat.ahs_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } else { V_ahstat.ahs_hist[sav->alg_auth]++; crypto_freereq(crp); /* No longer needed. */ crp = NULL; } /* Shouldn't happen... */ if (m == NULL) { V_ahstat.ahs_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } /* Figure out header size. */ rplen = HDRSIZE(sav); authsize = AUTHSIZE(sav); /* Copy authenticator off the packet. */ m_copydata(m, skip + rplen, authsize, calc); /* * If we have an mtag, we don't need to verify the authenticator -- * it has been verified by an IPsec-aware NIC. */ if (mtag == NULL) { ptr = (caddr_t) (tc + 1); /* Verify authenticator. */ if (bcmp(ptr + skip + rplen, calc, authsize)) { DPRINTF(("%s: authentication hash mismatch for packet " "in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_badauth++; error = EACCES; goto bad; } /* Fix the Next Protocol field. */ ((u_int8_t *) ptr)[protoff] = nxt; /* Copyback the saved (uncooked) network headers. */ m_copyback(m, 0, skip, ptr); } else { /* Fix the Next Protocol field. */ m_copyback(m, protoff, sizeof(u_int8_t), &nxt); } free(tc, M_XDATA), tc = NULL; /* No longer needed */ /* * Header is now authenticated. */ m->m_flags |= M_AUTHIPHDR|M_AUTHIPDGM; /* * Update replay sequence number, if appropriate. */ if (sav->replay) { u_int32_t seq; m_copydata(m, skip + offsetof(struct newah, ah_seq), sizeof (seq), (caddr_t) &seq); if (ipsec_updatereplay(ntohl(seq), sav)) { V_ahstat.ahs_replay++; error = ENOBUFS; /*XXX as above*/ goto bad; } } /* * Remove the AH header and authenticator from the mbuf. 
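 *
 * m_striphdr() below cuts rplen + authsize bytes out of the chain at
 * offset skip.  On a flat buffer the equivalent would be
 *
 *	memmove(buf + skip, buf + skip + hl, len - skip - hl);
 *	len -= hl;
 *
 * so a 20-byte IPv4 header followed by a 24-byte AH leaves the
 * payload starting at byte 20 and the datagram 24 bytes shorter.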
*/ error = m_striphdr(m, skip, rplen + authsize); if (error) { DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_hdrops++; goto bad; } IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag); KEY_FREESAV(&sav); return error; bad: if (sav) KEY_FREESAV(&sav); if (m != NULL) m_freem(m); if (tc != NULL) free(tc, M_XDATA); if (crp != NULL) crypto_freereq(crp); return error; } /* * AH output routine, called by ipsec[46]_process_packet(). */ static int ah_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff) { INIT_VNET_IPSEC(curvnet); struct secasvar *sav; struct auth_hash *ahx; struct cryptodesc *crda; struct tdb_crypto *tc; struct mbuf *mi; struct cryptop *crp; u_int16_t iplen; int error, rplen, authsize, maxpacketsize, roff; u_int8_t prot; struct newah *ah; sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); ahx = sav->tdb_authalgxform; IPSEC_ASSERT(ahx != NULL, ("null authentication xform")); V_ahstat.ahs_output++; /* Figure out header size. */ rplen = HDRSIZE(sav); /* Check for maximum packet size violations. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: DPRINTF(("%s: unknown/unsupported protocol family %u, " "SA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_nopf++; error = EPFNOSUPPORT; goto bad; } authsize = AUTHSIZE(sav); if (rplen + authsize + m->m_pkthdr.len > maxpacketsize) { DPRINTF(("%s: packet in SA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi), rplen + authsize + m->m_pkthdr.len, maxpacketsize)); V_ahstat.ahs_toobig++; error = EMSGSIZE; goto bad; } /* Update the counters. */ V_ahstat.ahs_obytes += m->m_pkthdr.len - skip; m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_hdrops++; error = ENOBUFS; goto bad; } /* Inject AH header. */ mi = m_makespace(m, skip, rplen + authsize, &roff); if (mi == NULL) { DPRINTF(("%s: failed to inject %u byte AH header for SA " "%s/%08lx\n", __func__, rplen + authsize, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_hdrops++; /*XXX differs from openbsd */ error = ENOBUFS; goto bad; } /* * The AH header is guaranteed by m_makespace() to be in * contiguous memory, at roff bytes offset into the returned mbuf. */ ah = (struct newah *)(mtod(mi, caddr_t) + roff); /* Initialize the AH header. */ m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nxt); ah->ah_len = (rplen + authsize - sizeof(struct ah)) / sizeof(u_int32_t); ah->ah_reserve = 0; ah->ah_spi = sav->spi; /* Zeroize authenticator. */ m_copyback(m, skip + rplen, authsize, ipseczeroes); /* Insert packet replay counter, as requested. */ if (sav->replay) { if (sav->replay->count == ~0 && (sav->flags & SADB_X_EXT_CYCSEQ) == 0) { DPRINTF(("%s: replay counter wrapped for SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); V_ahstat.ahs_wrap++; error = EINVAL; goto bad; } #ifdef REGRESSION /* Emulate replay attack when ipsec_replay is TRUE. 
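 *
 * Skipping the increment is the whole attack: the next packet reuses
 * the previous sequence number, so a correct peer's replay window
 * must drop it.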
*/ if (!V_ipsec_replay) #endif sav->replay->count++; ah->ah_seq = htonl(sav->replay->count); } /* Get crypto descriptors. */ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); V_ahstat.ahs_crypto++; error = ENOBUFS; goto bad; } crda = crp->crp_desc; crda->crd_skip = 0; crda->crd_inject = skip + rplen; crda->crd_len = m->m_pkthdr.len; /* Authentication operation. */ crda->crd_alg = ahx->type; crda->crd_key = sav->key_auth->key_data; crda->crd_klen = _KEYBITS(sav->key_auth); /* Allocate IPsec-specific opaque crypto info. */ tc = (struct tdb_crypto *) malloc( sizeof(struct tdb_crypto) + skip, M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); V_ahstat.ahs_crypto++; error = ENOBUFS; goto bad; } /* Save the skipped portion of the packet. */ m_copydata(m, 0, skip, (caddr_t) (tc + 1)); /* * Fix IP header length on the header used for * authentication. We don't need to fix the original * header length as it will be fixed by our caller. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: bcopy(((caddr_t)(tc + 1)) + offsetof(struct ip, ip_len), (caddr_t) &iplen, sizeof(u_int16_t)); iplen = htons(ntohs(iplen) + rplen + authsize); m_copyback(m, offsetof(struct ip, ip_len), sizeof(u_int16_t), (caddr_t) &iplen); break; #endif /* INET */ #ifdef INET6 case AF_INET6: bcopy(((caddr_t)(tc + 1)) + offsetof(struct ip6_hdr, ip6_plen), (caddr_t) &iplen, sizeof(u_int16_t)); iplen = htons(ntohs(iplen) + rplen + authsize); m_copyback(m, offsetof(struct ip6_hdr, ip6_plen), sizeof(u_int16_t), (caddr_t) &iplen); break; #endif /* INET6 */ } /* Fix the Next Header field in saved header. */ ((u_int8_t *) (tc + 1))[protoff] = IPPROTO_AH; /* Update the Next Protocol field in the IP header. */ prot = IPPROTO_AH; m_copyback(m, protoff, sizeof(u_int8_t), (caddr_t) &prot); /* "Massage" the packet headers for crypto processing. */ error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family, skip, ahx->type, 1); if (error != 0) { m = NULL; /* mbuf was free'd by ah_massage_headers. */ free(tc, M_XDATA); crypto_freereq(crp); goto bad; } /* Crypto operation descriptor. */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ah_output_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback. */ tc->tc_isr = isr; tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_skip = skip; tc->tc_protoff = protoff; return crypto_dispatch(crp); bad: if (m) m_freem(m); return (error); } /* * AH output callback from the crypto driver. */ static int ah_output_cb(struct cryptop *crp) { INIT_VNET_IPSEC(curvnet); int skip, protoff, error; struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; struct mbuf *m; caddr_t ptr; int err; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque data area!")); skip = tc->tc_skip; protoff = tc->tc_protoff; ptr = (caddr_t) (tc + 1); m = (struct mbuf *) crp->crp_buf; isr = tc->tc_isr; IPSECREQUEST_LOCK(isr); sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { V_ahstat.ahs_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; } IPSEC_ASSERT(isr->sav == sav, ("SA changed\n")); /* Check for crypto errors. 
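 *
 * EAGAIN from the crypto layer means the session migrated to another
 * driver; the convention, here and in the other callbacks, is to
 * record the new session id and simply resubmit (in outline):
 *
 *	if (crp->crp_etype == EAGAIN) {
 *		sav->tdb_cryptoid = crp->crp_sid;
 *		return crypto_dispatch(crp);
 *	}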
*/ if (crp->crp_etype) { if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); error = crypto_dispatch(crp); return error; } V_ahstat.ahs_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { V_ahstat.ahs_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } V_ahstat.ahs_hist[sav->alg_auth]++; /* * Copy original headers (with the new protocol number) back * in place. */ m_copyback(m, 0, skip, ptr); /* No longer needed. */ free(tc, M_XDATA); crypto_freereq(crp); #ifdef REGRESSION /* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */ if (V_ipsec_integrity) { int alen; /* * Corrupt HMAC if we want to test integrity verification of * the other side. */ alen = AUTHSIZE(sav); m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes); } #endif /* NB: m is reclaimed by ipsec_process_done. */ err = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); return err; bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); return error; } static struct xformsw ah_xformsw = { XF_AH, XFT_AUTH, "IPsec AH", ah_init, ah_zeroize, ah_input, ah_output, }; static void ah_attach(void) { + + V_ah_enable = 1; /* control flow of packets with AH */ + V_ah_cleartos = 1; /* clear ip_tos when doing AH calc */ + xform_register(&ah_xformsw); } SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL); Index: head/sys/netipsec/xform_esp.c =================================================================== --- head/sys/netipsec/xform_esp.c (revision 185087) +++ head/sys/netipsec/xform_esp.c (revision 185088) @@ -1,1009 +1,1012 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_esp.c,v 1.69 2001/06/26 06:18:59 angelos Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. * * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 2001 Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. 
*/ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include -int esp_enable = 1; +#ifdef VIMAGE_GLOBALS struct espstat espstat; +static int esp_max_ivlen; /* max iv length over all algorithms */ +int esp_enable; +#endif SYSCTL_DECL(_net_inet_esp); SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_esp, OID_AUTO, esp_enable, CTLFLAG_RW, esp_enable, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_esp, IPSECCTL_STATS, stats, CTLFLAG_RD, espstat, espstat, ""); -static int esp_max_ivlen; /* max iv length over all algorithms */ - static int esp_input_cb(struct cryptop *op); static int esp_output_cb(struct cryptop *crp); /* * NB: this is public for use by the PF_KEY support. * NB: if you add support here; be sure to add code to esp_attach below! */ struct enc_xform * esp_algorithm_lookup(int alg) { if (alg >= ESP_ALG_MAX) return NULL; switch (alg) { case SADB_EALG_DESCBC: return &enc_xform_des; case SADB_EALG_3DESCBC: return &enc_xform_3des; case SADB_X_EALG_AES: return &enc_xform_rijndael128; case SADB_X_EALG_BLOWFISHCBC: return &enc_xform_blf; case SADB_X_EALG_CAST128CBC: return &enc_xform_cast5; case SADB_X_EALG_SKIPJACK: return &enc_xform_skipjack; case SADB_EALG_NULL: return &enc_xform_null; case SADB_X_EALG_CAMELLIACBC: return &enc_xform_camellia; } return NULL; } size_t esp_hdrsiz(struct secasvar *sav) { INIT_VNET_IPSEC(curvnet); size_t size; if (sav != NULL) { /*XXX not right for null algorithm--does it matter??*/ IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("SA with null xform")); if (sav->flags & SADB_X_EXT_OLD) size = sizeof (struct esp); else size = sizeof (struct newesp); size += sav->tdb_encalgxform->blocksize + 9; /*XXX need alg check???*/ if (sav->tdb_authalgxform != NULL && sav->replay) size += ah_hdrsiz(sav); } else { /* * base header size * + max iv length for CBC mode * + max pad length * + sizeof (pad length field) * + sizeof (next header field) * + max icv supported. */ size = sizeof (struct newesp) + V_esp_max_ivlen + 9 + 16; } return size; } /* * esp_init() is called when an SPI is being set up. */ static int esp_init(struct secasvar *sav, struct xformsw *xsp) { INIT_VNET_IPSEC(curvnet); struct enc_xform *txform; struct cryptoini cria, crie; int keylen; int error; txform = esp_algorithm_lookup(sav->alg_enc); if (txform == NULL) { DPRINTF(("%s: unsupported encryption algorithm %d\n", __func__, sav->alg_enc)); return EINVAL; } if (sav->key_enc == NULL) { DPRINTF(("%s: no encoding key for %s algorithm\n", __func__, txform->name)); return EINVAL; } if ((sav->flags&(SADB_X_EXT_OLD|SADB_X_EXT_IV4B)) == SADB_X_EXT_IV4B) { DPRINTF(("%s: 4-byte IV not supported with protocol\n", __func__)); return EINVAL; } keylen = _KEYLEN(sav->key_enc); if (txform->minkey > keylen || keylen > txform->maxkey) { DPRINTF(("%s: invalid key length %u, must be in the range " "[%u..%u] for algorithm %s\n", __func__, keylen, txform->minkey, txform->maxkey, txform->name)); return EINVAL; } /* * NB: The null xform needs a non-zero blocksize to keep the * crypto code happy but if we use it to set ivlen then * the ESP header will be processed incorrectly. The * compromise is to force it to zero here. */ sav->ivlen = (txform == &enc_xform_null ? 
0 : txform->blocksize); sav->iv = (caddr_t) malloc(sav->ivlen, M_XDATA, M_WAITOK); if (sav->iv == NULL) { DPRINTF(("%s: no memory for IV\n", __func__)); return EINVAL; } key_randomfill(sav->iv, sav->ivlen); /*XXX*/ /* * Setup AH-related state. */ if (sav->alg_auth != 0) { error = ah_init0(sav, xsp, &cria); if (error) return error; } /* NB: override anything set in ah_init0 */ sav->tdb_xform = xsp; sav->tdb_encalgxform = txform; /* Initialize crypto session. */ bzero(&crie, sizeof (crie)); crie.cri_alg = sav->tdb_encalgxform->type; crie.cri_klen = _KEYBITS(sav->key_enc); crie.cri_key = sav->key_enc->key_data; /* XXX Rounds ? */ if (sav->tdb_authalgxform && sav->tdb_encalgxform) { /* init both auth & enc */ crie.cri_next = &cria; error = crypto_newsession(&sav->tdb_cryptoid, &crie, V_crypto_support); } else if (sav->tdb_encalgxform) { error = crypto_newsession(&sav->tdb_cryptoid, &crie, V_crypto_support); } else if (sav->tdb_authalgxform) { error = crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support); } else { /* XXX cannot happen? */ DPRINTF(("%s: no encoding OR authentication xform!\n", __func__)); error = EINVAL; } return error; } /* * Paranoia. */ static int esp_zeroize(struct secasvar *sav) { /* NB: ah_zerorize free's the crypto session state */ int error = ah_zeroize(sav); if (sav->key_enc) bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc)); if (sav->iv) { free(sav->iv, M_XDATA); sav->iv = NULL; } sav->tdb_encalgxform = NULL; sav->tdb_xform = NULL; return error; } /* * ESP input processing, called (eventually) through the protocol switch. */ static int esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { INIT_VNET_IPSEC(curvnet); struct auth_hash *esph; struct enc_xform *espx; struct tdb_ident *tdbi; struct tdb_crypto *tc; int plen, alen, hlen; struct m_tag *mtag; struct newesp *esp; struct cryptodesc *crde; struct cryptop *crp; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("null encoding xform")); IPSEC_ASSERT((skip&3) == 0 && (m->m_pkthdr.len&3) == 0, ("misaligned packet, skip %u pkt len %u", skip, m->m_pkthdr.len)); /* XXX don't pullup, just copy header */ IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp)); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; /* Determine the ESP header length */ if (sav->flags & SADB_X_EXT_OLD) hlen = sizeof (struct esp) + sav->ivlen; else hlen = sizeof (struct newesp) + sav->ivlen; /* Authenticator hash size */ alen = esph ? AH_HMAC_HASHLEN : 0; /* * Verify payload length is multiple of encryption algorithm * block size. * * NB: This works for the null algorithm because the blocksize * is 4 and all packets must be 4-byte aligned regardless * of the algorithm. */ plen = m->m_pkthdr.len - (skip + hlen + alen); if ((plen & (espx->blocksize - 1)) || (plen <= 0)) { DPRINTF(("%s: payload of %d octets not a multiple of %d octets," " SA %s/%08lx\n", __func__, plen, espx->blocksize, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); V_espstat.esps_badilen++; m_freem(m); return EINVAL; } /* * Check sequence number. 
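 *
 * ipsec_chkreplay() implements the sliding-window test of RFC 2401,
 * appendix C.  In outline (names illustrative, window width 32 here):
 *
 *	if (seq > lastseq)
 *		accept; shift bitmap left by (seq - lastseq), set bit 0
 *	else if (lastseq - seq >= 32 || bitmap & (1 << (lastseq - seq)))
 *		reject (too old, or already seen)
 *	else
 *		accept; set bit (lastseq - seq)
 *
 * The window is only advanced later, in ipsec_updatereplay(), once
 * the authenticator has verified.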
*/ if (esph && sav->replay && !ipsec_chkreplay(ntohl(esp->esp_seq), sav)) { DPRINTF(("%s: packet replay check for %s\n", __func__, ipsec_logsastr(sav))); /*XXX*/ V_espstat.esps_replay++; m_freem(m); return ENOBUFS; /*XXX*/ } /* Update the counters */ V_espstat.esps_ibytes += m->m_pkthdr.len - (skip + hlen + alen); /* Find out if we've already done crypto */ for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL); mtag != NULL; mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) { tdbi = (struct tdb_ident *) (mtag + 1); if (tdbi->proto == sav->sah->saidx.proto && tdbi->spi == sav->spi && !bcmp(&tdbi->dst, &sav->sah->saidx.dst, sizeof(union sockaddr_union))) break; } /* Get crypto descriptors */ crp = crypto_getreq(esph && espx ? 2 : 1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); V_espstat.esps_crypto++; m_freem(m); return ENOBUFS; } /* Get IPsec-specific opaque pointer */ if (esph == NULL || mtag != NULL) tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); else tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto) + alen, M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); V_espstat.esps_crypto++; m_freem(m); return ENOBUFS; } tc->tc_ptr = (caddr_t) mtag; if (esph) { struct cryptodesc *crda = crp->crp_desc; IPSEC_ASSERT(crda != NULL, ("null ah crypto descriptor")); /* Authentication descriptor */ crda->crd_skip = skip; crda->crd_len = m->m_pkthdr.len - (skip + alen); crda->crd_inject = m->m_pkthdr.len - alen; crda->crd_alg = esph->type; crda->crd_key = sav->key_auth->key_data; crda->crd_klen = _KEYBITS(sav->key_auth); /* Copy the authenticator */ if (mtag == NULL) m_copydata(m, m->m_pkthdr.len - alen, alen, (caddr_t) (tc + 1)); /* Chain authentication request */ crde = crda->crd_next; } else { crde = crp->crp_desc; } /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = esp_input_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback */ tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_protoff = protoff; tc->tc_skip = skip; /* Decryption descriptor */ if (espx) { IPSEC_ASSERT(crde != NULL, ("null esp crypto descriptor")); crde->crd_skip = skip + hlen; crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen); crde->crd_inject = skip + hlen - sav->ivlen; crde->crd_alg = espx->type; crde->crd_key = sav->key_enc->key_data; crde->crd_klen = _KEYBITS(sav->key_enc); /* XXX Rounds ? */ } if (mtag == NULL) return crypto_dispatch(crp); else return esp_input_cb(crp); } #ifdef INET6 #define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) do { \ if (saidx->dst.sa.sa_family == AF_INET6) { \ error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); \ } else { \ error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); \ } \ } while (0) #else #define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) \ (error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag)) #endif /* * ESP input callback from the crypto driver. 
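 */

/*
 * A worked example of the offsets programmed above, with illustrative
 * numbers: an outer IPv4 header (skip = 20), a new-format ESP header
 * (8 bytes) and an 8-byte IV give hlen = 16, so the ciphertext starts
 * at byte 36 and crd_inject points the driver at the IV at byte 28.
 * Sketch only, not compiled:
 */
#if 0
static void
esp_offsets_example(void)
{
	int skip = 20;			/* outer IPv4 header */
	int ivlen = 8;			/* e.g. 3DES-CBC */
	int hlen = 8 + ivlen;		/* sizeof(struct newesp) + IV */
	int payload_off = skip + hlen;		/* 36 */
	int iv_off = skip + hlen - ivlen;	/* 28 */

	(void)payload_off;
	(void)iv_off;
}
#endif

/*
 * esp_input_cb: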
*/ static int esp_input_cb(struct cryptop *crp) { INIT_VNET_IPSEC(curvnet); u_int8_t lastthree[3], aalg[AH_HMAC_HASHLEN]; int hlen, skip, protoff, error; struct mbuf *m; struct cryptodesc *crd; struct auth_hash *esph; struct enc_xform *espx; struct tdb_crypto *tc; struct m_tag *mtag; struct secasvar *sav; struct secasindex *saidx; caddr_t ptr; crd = crp->crp_desc; IPSEC_ASSERT(crd != NULL, ("null crypto descriptor!")); tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!")); skip = tc->tc_skip; protoff = tc->tc_protoff; mtag = (struct m_tag *) tc->tc_ptr; m = (struct mbuf *) crp->crp_buf; sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { V_espstat.esps_notdb++; DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n", __func__, ipsec_address(&tc->tc_dst), (u_long) ntohl(tc->tc_spi), tc->tc_proto)); error = ENOBUFS; /*XXX*/ goto bad; } saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; /* Check for crypto errors */ if (crp->crp_etype) { /* Reset the session ID */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { KEY_FREESAV(&sav); error = crypto_dispatch(crp); return error; } V_espstat.esps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { V_espstat.esps_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } V_espstat.esps_hist[sav->alg_enc]++; /* If authentication was performed, check now. */ if (esph != NULL) { /* * If we have a tag, it means an IPsec-aware NIC did * the verification for us. Otherwise we need to * check the authentication calculation. */ V_ahstat.ahs_hist[sav->alg_auth]++; if (mtag == NULL) { /* Copy the authenticator from the packet */ m_copydata(m, m->m_pkthdr.len - AH_HMAC_HASHLEN, AH_HMAC_HASHLEN, aalg); ptr = (caddr_t) (tc + 1); /* Verify authenticator */ if (bcmp(ptr, aalg, AH_HMAC_HASHLEN) != 0) { DPRINTF(("%s: " "authentication hash mismatch for packet in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_espstat.esps_badauth++; error = EACCES; goto bad; } } /* Remove trailing authenticator */ m_adj(m, -AH_HMAC_HASHLEN); } /* Release the crypto descriptors */ free(tc, M_XDATA), tc = NULL; crypto_freereq(crp), crp = NULL; /* * Packet is now decrypted. */ m->m_flags |= M_DECRYPTED; /* * Update replay sequence number, if appropriate. */ if (sav->replay) { u_int32_t seq; m_copydata(m, skip + offsetof(struct newesp, esp_seq), sizeof (seq), (caddr_t) &seq); if (ipsec_updatereplay(ntohl(seq), sav)) { DPRINTF(("%s: packet replay check for %s\n", __func__, ipsec_logsastr(sav))); V_espstat.esps_replay++; error = ENOBUFS; goto bad; } } /* Determine the ESP header length */ if (sav->flags & SADB_X_EXT_OLD) hlen = sizeof (struct esp) + sav->ivlen; else hlen = sizeof (struct newesp) + sav->ivlen; /* Remove the ESP header and IV from the mbuf. 
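 *
 * After the strip below the mbuf holds
 *
 *	payload || padding || pad length || next header
 *
 * (the trailing authenticator was already trimmed with m_adj() above),
 * which is exactly the shape the last-three-byte checks that follow
 * rely on.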
*/ error = m_striphdr(m, skip, hlen); if (error) { V_espstat.esps_hdrops++; DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); goto bad; } /* Save the last three bytes of decrypted data */ m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); /* Verify pad length */ if (lastthree[1] + 2 > m->m_pkthdr.len - skip) { V_espstat.esps_badilen++; DPRINTF(("%s: invalid padding length %d for %u byte packet " "in SA %s/%08lx\n", __func__, lastthree[1], m->m_pkthdr.len - skip, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EINVAL; goto bad; } /* Verify correct decryption by checking the last padding bytes */ if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) { if (lastthree[1] != lastthree[0] && lastthree[1] != 0) { V_espstat.esps_badenc++; DPRINTF(("%s: decryption failed for packet in " "SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EINVAL; goto bad; } } /* Trim the mbuf chain to remove trailing authenticator and padding */ m_adj(m, -(lastthree[1] + 2)); /* Restore the Next Protocol field */ m_copyback(m, protoff, sizeof (u_int8_t), lastthree + 2); IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag); KEY_FREESAV(&sav); return error; bad: if (sav) KEY_FREESAV(&sav); if (m != NULL) m_freem(m); if (tc != NULL) free(tc, M_XDATA); if (crp != NULL) crypto_freereq(crp); return error; } /* * ESP output routine, called by ipsec[46]_process_packet(). */ static int esp_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff ) { INIT_VNET_IPSEC(curvnet); struct enc_xform *espx; struct auth_hash *esph; int hlen, rlen, plen, padding, blks, alen, i, roff; struct mbuf *mo = (struct mbuf *) NULL; struct tdb_crypto *tc; struct secasvar *sav; struct secasindex *saidx; unsigned char *pad; u_int8_t prot; int error, maxpacketsize; struct cryptodesc *crde = NULL, *crda = NULL; struct cryptop *crp; sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; IPSEC_ASSERT(espx != NULL, ("null encoding xform")); if (sav->flags & SADB_X_EXT_OLD) hlen = sizeof (struct esp) + sav->ivlen; else hlen = sizeof (struct newesp) + sav->ivlen; rlen = m->m_pkthdr.len - skip; /* Raw payload length. */ /* * NB: The null encoding transform has a blocksize of 4 * so that headers are properly aligned. */ blks = espx->blocksize; /* IV blocksize */ /* XXX clamp padding length a la KAME??? */ padding = ((blks - ((rlen + 2) % blks)) % blks) + 2; plen = rlen + padding; /* Padded payload length. */ if (esph) alen = AH_HMAC_HASHLEN; else alen = 0; V_espstat.esps_output++; saidx = &sav->sah->saidx; /* Check for maximum packet size violations. */ switch (saidx->dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: DPRINTF(("%s: unknown/unsupported protocol " "family %d, SA %s/%08lx\n", __func__, saidx->dst.sa.sa_family, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_espstat.esps_nopf++; error = EPFNOSUPPORT; goto bad; } if (skip + hlen + rlen + padding + alen > maxpacketsize) { DPRINTF(("%s: packet in SA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi), skip + hlen + rlen + padding + alen, maxpacketsize)); V_espstat.esps_toobig++; error = EMSGSIZE; goto bad; } /* Update the counters. 
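 *
 * A worked example of the padding computed above, with illustrative
 * numbers: for rlen = 100 and an 8-byte block cipher,
 *
 *	padding = ((8 - ((100 + 2) % 8)) % 8) + 2 = 4
 *
 * so rlen + padding = 104 is block-aligned, with the final two pad
 * bytes reserved for the pad-length and next-header fields.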
*/ V_espstat.esps_obytes += m->m_pkthdr.len - skip; m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_espstat.esps_hdrops++; error = ENOBUFS; goto bad; } /* Inject ESP header. */ mo = m_makespace(m, skip, hlen, &roff); if (mo == NULL) { DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n", __func__, hlen, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_espstat.esps_hdrops++; /* XXX diffs from openbsd */ error = ENOBUFS; goto bad; } /* Initialize ESP header. */ bcopy((caddr_t) &sav->spi, mtod(mo, caddr_t) + roff, sizeof(u_int32_t)); if (sav->replay) { u_int32_t replay; #ifdef REGRESSION /* Emulate replay attack when ipsec_replay is TRUE. */ if (!V_ipsec_replay) #endif sav->replay->count++; replay = htonl(sav->replay->count); bcopy((caddr_t) &replay, mtod(mo, caddr_t) + roff + sizeof(u_int32_t), sizeof(u_int32_t)); } /* * Add padding -- better to do it ourselves than use the crypto engine, * although if/when we support compression, we'd have to do that. */ pad = (u_char *) m_pad(m, padding + alen); if (pad == NULL) { DPRINTF(("%s: m_pad failed for SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); m = NULL; /* NB: free'd by m_pad */ error = ENOBUFS; goto bad; } /* * Add padding: random, zero, or self-describing. * XXX catch unexpected setting */ switch (sav->flags & SADB_X_EXT_PMASK) { case SADB_X_EXT_PRAND: (void) read_random(pad, padding - 2); break; case SADB_X_EXT_PZERO: bzero(pad, padding - 2); break; case SADB_X_EXT_PSEQ: for (i = 0; i < padding - 2; i++) pad[i] = i+1; break; } /* Fix padding length and Next Protocol in padding itself. */ pad[padding - 2] = padding - 2; m_copydata(m, protoff, sizeof(u_int8_t), pad + padding - 1); /* Fix Next Protocol in IPv4/IPv6 header. */ prot = IPPROTO_ESP; m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot); /* Get crypto descriptors. */ crp = crypto_getreq(esph && espx ? 2 : 1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); V_espstat.esps_crypto++; error = ENOBUFS; goto bad; } if (espx) { crde = crp->crp_desc; crda = crde->crd_next; /* Encryption descriptor. */ crde->crd_skip = skip + hlen; crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen); crde->crd_flags = CRD_F_ENCRYPT; crde->crd_inject = skip + hlen - sav->ivlen; /* Encryption operation. */ crde->crd_alg = espx->type; crde->crd_key = sav->key_enc->key_data; crde->crd_klen = _KEYBITS(sav->key_enc); /* XXX Rounds ? */ } else crda = crp->crp_desc; /* IPsec-specific opaque crypto info. */ tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); V_espstat.esps_crypto++; error = ENOBUFS; goto bad; } /* Callback parameters */ tc->tc_isr = isr; tc->tc_spi = sav->spi; tc->tc_dst = saidx->dst; tc->tc_proto = saidx->proto; /* Crypto operation descriptor. */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = esp_output_cb; crp->crp_opaque = (caddr_t) tc; crp->crp_sid = sav->tdb_cryptoid; if (esph) { /* Authentication descriptor. */ crda->crd_skip = skip; crda->crd_len = m->m_pkthdr.len - (skip + alen); crda->crd_inject = m->m_pkthdr.len - alen; /* Authentication operation. 
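 *
 * (The pad fill above follows RFC 2406: SADB_X_EXT_PSEQ writes the
 * self-describing sequence 1, 2, 3, ... so the receiver can sanity
 * check the decryption, and the last two pad bytes always carry the
 * pad length and the saved next-header value.  For 4 bytes of padding
 * on a TCP payload, the trailer is { 0x01, 0x02, 0x02, IPPROTO_TCP }.)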
*/ crda->crd_alg = esph->type; crda->crd_key = sav->key_auth->key_data; crda->crd_klen = _KEYBITS(sav->key_auth); } return crypto_dispatch(crp); bad: if (m) m_freem(m); return (error); } /* * ESP output callback from the crypto driver. */ static int esp_output_cb(struct cryptop *crp) { INIT_VNET_IPSEC(curvnet); struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; struct mbuf *m; int err, error; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque data area!")); m = (struct mbuf *) crp->crp_buf; isr = tc->tc_isr; IPSECREQUEST_LOCK(isr); sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { V_espstat.esps_notdb++; DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n", __func__, ipsec_address(&tc->tc_dst), (u_long) ntohl(tc->tc_spi), tc->tc_proto)); error = ENOBUFS; /*XXX*/ goto bad; } IPSEC_ASSERT(isr->sav == sav, ("SA changed was %p now %p\n", isr->sav, sav)); /* Check for crypto errors. */ if (crp->crp_etype) { /* Reset session ID. */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); error = crypto_dispatch(crp); return error; } V_espstat.esps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { V_espstat.esps_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } V_espstat.esps_hist[sav->alg_enc]++; if (sav->tdb_authalgxform != NULL) V_ahstat.ahs_hist[sav->alg_auth]++; /* Release crypto descriptors. */ free(tc, M_XDATA); crypto_freereq(crp); #ifdef REGRESSION /* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */ if (V_ipsec_integrity) { static unsigned char ipseczeroes[AH_HMAC_HASHLEN]; struct auth_hash *esph; /* * Corrupt HMAC if we want to test integrity verification of * the other side. */ esph = sav->tdb_authalgxform; if (esph != NULL) { m_copyback(m, m->m_pkthdr.len - AH_HMAC_HASHLEN, AH_HMAC_HASHLEN, ipseczeroes); } } #endif /* NB: m is reclaimed by ipsec_process_done. 
*/ err = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); return err; bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); return error; } static struct xformsw esp_xformsw = { XF_ESP, XFT_CONF|XFT_AUTH, "IPsec ESP", esp_init, esp_zeroize, esp_input, esp_output }; static void esp_attach(void) { #define MAXIV(xform) \ if (xform.blocksize > V_esp_max_ivlen) \ V_esp_max_ivlen = xform.blocksize \ + V_esp_enable = 1; V_esp_max_ivlen = 0; + MAXIV(enc_xform_des); /* SADB_EALG_DESCBC */ MAXIV(enc_xform_3des); /* SADB_EALG_3DESCBC */ MAXIV(enc_xform_rijndael128); /* SADB_X_EALG_AES */ MAXIV(enc_xform_blf); /* SADB_X_EALG_BLOWFISHCBC */ MAXIV(enc_xform_cast5); /* SADB_X_EALG_CAST128CBC */ MAXIV(enc_xform_skipjack); /* SADB_X_EALG_SKIPJACK */ MAXIV(enc_xform_null); /* SADB_EALG_NULL */ MAXIV(enc_xform_camellia); /* SADB_X_EALG_CAMELLIACBC */ xform_register(&esp_xformsw); #undef MAXIV } SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL); Index: head/sys/netipsec/xform_ipcomp.c =================================================================== --- head/sys/netipsec/xform_ipcomp.c (revision 185087) +++ head/sys/netipsec/xform_ipcomp.c (revision 185088) @@ -1,602 +1,606 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */ /*- * Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* IP payload compression protocol (IPComp), see RFC 2393 */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include -int ipcomp_enable = 0; +#ifdef VIMAGE_GLOBALS +int ipcomp_enable; struct ipcompstat ipcompstat; +#endif SYSCTL_DECL(_net_inet_ipcomp); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipcomp, OID_AUTO, ipcomp_enable, CTLFLAG_RW, ipcomp_enable, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipcomp, IPSECCTL_STATS, stats, CTLFLAG_RD, ipcompstat, ipcompstat, ""); static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); struct comp_algo * ipcomp_algorithm_lookup(int alg) { if (alg >= IPCOMP_ALG_MAX) return NULL; switch (alg) { case SADB_X_CALG_DEFLATE: return &comp_algo_deflate; } return NULL; } /* * ipcomp_init() is called when an CPI is being set up. */ static int ipcomp_init(struct secasvar *sav, struct xformsw *xsp) { INIT_VNET_IPSEC(curvnet); struct comp_algo *tcomp; struct cryptoini cric; /* NB: algorithm really comes in alg_enc and not alg_comp! */ tcomp = ipcomp_algorithm_lookup(sav->alg_enc); if (tcomp == NULL) { DPRINTF(("%s: unsupported compression algorithm %d\n", __func__, sav->alg_comp)); return EINVAL; } sav->alg_comp = sav->alg_enc; /* set for doing histogram */ sav->tdb_xform = xsp; sav->tdb_compalgxform = tcomp; /* Initialize crypto session */ bzero(&cric, sizeof (cric)); cric.cri_alg = sav->tdb_compalgxform->type; return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support); } /* * ipcomp_zeroize() used when IPCA is deleted */ static int ipcomp_zeroize(struct secasvar *sav) { int err; err = crypto_freesession(sav->tdb_cryptoid); sav->tdb_cryptoid = 0; return err; } /* * ipcomp_input() gets called to uncompress an input packet */ static int ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { INIT_VNET_IPSEC(curvnet); struct tdb_crypto *tc; struct cryptodesc *crdc; struct cryptop *crp; int hlen = IPCOMP_HLENGTH; /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { m_freem(m); DPRINTF(("%s: no crypto descriptors\n", __func__)); V_ipcompstat.ipcomps_crypto++; return ENOBUFS; } /* Get IPsec-specific opaque pointer */ tc = (struct tdb_crypto *) malloc(sizeof (*tc), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { m_freem(m); crypto_freereq(crp); DPRINTF(("%s: cannot allocate tdb_crypto\n", __func__)); V_ipcompstat.ipcomps_crypto++; return ENOBUFS; } crdc = crp->crp_desc; crdc->crd_skip = skip + hlen; crdc->crd_len = m->m_pkthdr.len - (skip + hlen); crdc->crd_inject = skip; tc->tc_ptr = 0; /* Decompression operation */ crdc->crd_alg = sav->tdb_compalgxform->type; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len - (skip + hlen); crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_input_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback */ tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_protoff = protoff; tc->tc_skip = skip; return crypto_dispatch(crp); } #ifdef INET6 #define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) do { \ if (saidx->dst.sa.sa_family == AF_INET6) { \ error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); \ } else { \ error 
= ipsec4_common_input_cb(m, sav, skip, protoff, mtag); \ } \ } while (0) #else #define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) \ (error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag)) #endif /* * IPComp input callback from the crypto driver. */ static int ipcomp_input_cb(struct cryptop *crp) { INIT_VNET_IPSEC(curvnet); struct cryptodesc *crd; struct tdb_crypto *tc; int skip, protoff; struct mtag *mtag; struct mbuf *m; struct secasvar *sav; struct secasindex *saidx; int hlen = IPCOMP_HLENGTH, error, clen; u_int8_t nproto; caddr_t addr; crd = crp->crp_desc; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!")); skip = tc->tc_skip; protoff = tc->tc_protoff; mtag = (struct mtag *) tc->tc_ptr; m = (struct mbuf *) crp->crp_buf; sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { V_ipcompstat.ipcomps_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; } saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); /* Check for crypto errors */ if (crp->crp_etype) { /* Reset the session ID */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { KEY_FREESAV(&sav); error = crypto_dispatch(crp); return error; } V_ipcompstat.ipcomps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: null mbuf returned from crypto\n", __func__)); error = EINVAL; goto bad; } V_ipcompstat.ipcomps_hist[sav->alg_comp]++; clen = crp->crp_olen; /* Length of data after processing */ /* Release the crypto descriptors */ free(tc, M_XDATA), tc = NULL; crypto_freereq(crp), crp = NULL; /* In case it's not done already, adjust the size of the mbuf chain */ m->m_pkthdr.len = clen + hlen + skip; if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == 0) { V_ipcompstat.ipcomps_hdrops++; /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); error = EINVAL; /*XXX*/ goto bad; } /* Keep the next protocol field */ addr = (caddr_t) mtod(m, struct ip *) + skip; nproto = ((struct ipcomp *) addr)->comp_nxt; /* Remove the IPCOMP header */ error = m_striphdr(m, skip, hlen); if (error) { V_ipcompstat.ipcomps_hdrops++; DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); goto bad; } /* Restore the Next Protocol field */ m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto); IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, NULL); KEY_FREESAV(&sav); return error; bad: if (sav) KEY_FREESAV(&sav); if (m) m_freem(m); if (tc != NULL) free(tc, M_XDATA); if (crp) crypto_freereq(crp); return error; } /* * IPComp output routine, called by ipsec[46]_process_packet() */ static int ipcomp_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff ) { INIT_VNET_IPSEC(curvnet); struct secasvar *sav; struct comp_algo *ipcompx; int error, ralen, hlen, maxpacketsize, roff; u_int8_t prot; struct cryptodesc *crdc; struct cryptop *crp; struct tdb_crypto *tc; struct mbuf *mo; struct ipcomp *ipcomp; sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); ipcompx = sav->tdb_compalgxform; IPSEC_ASSERT(ipcompx != NULL, ("null compression xform")); ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. 
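 *
 * For reference, the IPComp header injected below is the fixed
 * four-byte RFC 3173 layout (struct ipcomp):
 *
 *	+----------+----------+-----------------+
 *	| next hdr |  flags   |  CPI (16 bits)  |
 *	+----------+----------+-----------------+
 *
 * which is why hlen is the constant IPCOMP_HLENGTH.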
*/ hlen = IPCOMP_HLENGTH; V_ipcompstat.ipcomps_output++; /* Check for maximum packet size violations. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: V_ipcompstat.ipcomps_nopf++; DPRINTF(("%s: unknown/unsupported protocol family %d, " "IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EPFNOSUPPORT; goto bad; } if (skip + hlen + ralen > maxpacketsize) { V_ipcompstat.ipcomps_toobig++; DPRINTF(("%s: packet in IPCA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi), skip + hlen + ralen, maxpacketsize)); error = EMSGSIZE; goto bad; } /* Update the counters */ V_ipcompstat.ipcomps_obytes += m->m_pkthdr.len - skip; m = m_unshare(m, M_NOWAIT); if (m == NULL) { V_ipcompstat.ipcomps_hdrops++; DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = ENOBUFS; goto bad; } /* Inject IPCOMP header */ mo = m_makespace(m, skip, hlen, &roff); if (mo == NULL) { V_ipcompstat.ipcomps_wrap++; DPRINTF(("%s: IPCOMP header inject failed for IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = ENOBUFS; goto bad; } ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff); /* Initialize the IPCOMP header */ /* XXX alignment always correct? */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt; break; #endif } ipcomp->comp_flags = 0; ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi)); /* Fix Next Protocol in IPv4/IPv6 header */ prot = IPPROTO_IPCOMP; m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot); /* Ok now, we can pass to the crypto processing */ /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); error = ENOBUFS; goto bad; } crdc = crp->crp_desc; /* Compression descriptor */ crdc->crd_skip = skip + hlen; crdc->crd_len = m->m_pkthdr.len - (skip + hlen); crdc->crd_flags = CRD_F_COMP; crdc->crd_inject = skip + hlen; /* Compression operation */ crdc->crd_alg = ipcompx->type; /* IPsec-specific opaque crypto info */ tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); crypto_freereq(crp); error = ENOBUFS; goto bad; } tc->tc_isr = isr; tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_skip = skip + hlen; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_output_cb; crp->crp_opaque = (caddr_t) tc; crp->crp_sid = sav->tdb_cryptoid; return crypto_dispatch(crp); bad: if (m) m_freem(m); return (error); } /* * IPComp output callback from the crypto driver. 
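 */

/*
 * Sketch of the CPI handling in ipcomp_output() above: RFC 3173
 * carries only a 16-bit compression parameter index on the wire
 * (values 0-63 are well known, e.g. 2 = DEFLATE), so the CPI lives in
 * the low 16 bits of the 32-bit SPI and is truncated when the header
 * is built.  Not compiled:
 */
#if 0
static u_int16_t
cpi_from_spi(u_int32_t spi)	/* SPI in network byte order */
{
	/* e.g. SPI 0x00000002 -> CPI 2 (DEFLATE) */
	return htons((u_int16_t) ntohl(spi));
}
#endif

/*
 * ipcomp_output_cb: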
*/ static int ipcomp_output_cb(struct cryptop *crp) { INIT_VNET_IPSEC(curvnet); struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; struct mbuf *m; int error, skip, rlen; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque data area!")); m = (struct mbuf *) crp->crp_buf; skip = tc->tc_skip; rlen = crp->crp_ilen - skip; isr = tc->tc_isr; IPSECREQUEST_LOCK(isr); sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { V_ipcompstat.ipcomps_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; } IPSEC_ASSERT(isr->sav == sav, ("SA changed\n")); /* Check for crypto errors */ if (crp->crp_etype) { /* Reset session ID */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); error = crypto_dispatch(crp); return error; } V_ipcompstat.ipcomps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: bogus return buffer from crypto\n", __func__)); error = EINVAL; goto bad; } V_ipcompstat.ipcomps_hist[sav->alg_comp]++; if (rlen > crp->crp_olen) { /* Adjust the length in the IP header */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len); break; #endif /* INET */ #ifdef INET6 case AF_INET6: mtod(m, struct ip6_hdr *)->ip6_plen = htons(m->m_pkthdr.len) - sizeof(struct ip6_hdr); break; #endif /* INET6 */ default: V_ipcompstat.ipcomps_nopf++; DPRINTF(("%s: unknown/unsupported protocol " "family %d, IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EPFNOSUPPORT; goto bad; } } else { /* compression was useless, we have lost time */ /* XXX add statistic */ } /* Release the crypto descriptor */ free(tc, M_XDATA); crypto_freereq(crp); /* NB: m is reclaimed by ipsec_process_done. */ error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); return error; bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); return error; } static struct xformsw ipcomp_xformsw = { XF_IPCOMP, XFT_COMP, "IPcomp", ipcomp_init, ipcomp_zeroize, ipcomp_input, ipcomp_output }; static void ipcomp_attach(void) { + + V_ipcomp_enable = 0; xform_register(&ipcomp_xformsw); } SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); Index: head/sys/netipsec/xform_ipip.c =================================================================== --- head/sys/netipsec/xform_ipip.c (revision 185087) +++ head/sys/netipsec/xform_ipip.c (revision 185088) @@ -1,708 +1,713 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ipip.c,v 1.25 2002/06/10 18:04:55 itojun Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. 
* * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 2001, Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ /* * IP-inside-IP processing */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_enc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MROUTING #include #endif #ifdef INET6 #include #include #include #include #include #endif #include #include #include /* * We can control the acceptance of IP4 packets by altering the sysctl * net.inet.ipip.allow value. Zero means drop them, all else is acceptance. */ -int ipip_allow = 0; +#ifdef VIMAGE_GLOBALS +int ipip_allow; struct ipipstat ipipstat; +#endif SYSCTL_DECL(_net_inet_ipip); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipip, OID_AUTO, ipip_allow, CTLFLAG_RW, ipip_allow, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipip, IPSECCTL_STATS, stats, CTLFLAG_RD, ipipstat, ipipstat, ""); /* XXX IPCOMP */ #define M_IPSEC (M_AUTHIPHDR|M_AUTHIPDGM|M_DECRYPTED) static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp); #ifdef INET6 /* * Really only a wrapper for ipip_input(), for use with IPv6. */ int ip4_input6(struct mbuf **m, int *offp, int proto) { #if 0 /* If we do not accept IP-in-IP explicitly, drop. */ if (!V_ipip_allow && ((*m)->m_flags & M_IPSEC) == 0) { DPRINTF(("%s: dropped due to policy\n", __func__)); V_ipipstat.ipips_pdrops++; m_freem(*m); return IPPROTO_DONE; } #endif _ipip_input(*m, *offp, NULL); return IPPROTO_DONE; } #endif /* INET6 */ #ifdef INET /* * Really only a wrapper for ipip_input(), for use with IPv4. */ void ip4_input(struct mbuf *m, int off) { #if 0 /* If we do not accept IP-in-IP explicitly, drop. */ if (!V_ipip_allow && (m->m_flags & M_IPSEC) == 0) { DPRINTF(("%s: dropped due to policy\n", __func__)); V_ipipstat.ipips_pdrops++; m_freem(m); return; } #endif _ipip_input(m, off, NULL); } #endif /* INET */ /* * ipip_input gets called when we receive an IP{46} encapsulated packet, * either because we got it at a real interface, or because AH or ESP * were being used in tunnel mode (in which case the rcvif element will * contain the address of the encX interface associated with the tunnel. 
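 *
 * Both before and after the outer header is stripped, the input path
 * classifies the packet by reading its first byte and inspecting the
 * version nibble in the high four bits. A minimal standalone sketch of
 * that dispatch, for illustration only (ip_version is a hypothetical
 * name, not a symbol in this file):
 *
 *	#include <stdio.h>
 *
 *	// Map a header's first byte to an IP version; 0 means unsupported
 *	// (the kernel code counts that as ipips_family and drops).
 *	static int
 *	ip_version(unsigned char first_byte)
 *	{
 *		switch (first_byte >> 4) {
 *		case 4:
 *			return (4);	// a struct ip follows
 *		case 6:
 *			return (6);	// a struct ip6_hdr follows
 *		default:
 *			return (0);
 *		}
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		printf("%d\n", ip_version(0x45));	// common IPv4 first byte
 *		printf("%d\n", ip_version(0x60));	// IPv6
 *		return (0);
 *	}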
*/ static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp) { INIT_VNET_NET(curvnet); INIT_VNET_IPSEC(curvnet); register struct sockaddr_in *sin; register struct ifnet *ifp; register struct ifaddr *ifa; struct ip *ipo; #ifdef INET6 register struct sockaddr_in6 *sin6; struct ip6_hdr *ip6 = NULL; u_int8_t itos; #endif u_int8_t nxt; int isr; u_int8_t otos; u_int8_t v; int hlen; V_ipipstat.ipips_ipackets++; m_copydata(m, 0, 1, &v); switch (v >> 4) { #ifdef INET case 4: hlen = sizeof(struct ip); break; #endif /* INET */ #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: V_ipipstat.ipips_family++; m_freem(m); return /* EAFNOSUPPORT */; } /* Bring the IP header into the first mbuf, if not there already */ if (m->m_len < hlen) { if ((m = m_pullup(m, hlen)) == NULL) { DPRINTF(("%s: m_pullup (1) failed\n", __func__)); V_ipipstat.ipips_hdrops++; return; } } ipo = mtod(m, struct ip *); #ifdef MROUTING if (ipo->ip_v == IPVERSION && ipo->ip_p == IPPROTO_IPV4) { if (IN_MULTICAST(((struct ip *)((char *) ipo + iphlen))->ip_dst.s_addr)) { ipip_mroute_input(m, iphlen); return; } } #endif /* MROUTING */ /* Keep the outer ECN field. */ switch (v >> 4) { #ifdef INET case 4: otos = ipo->ip_tos; break; #endif /* INET */ #ifdef INET6 case 6: otos = (ntohl(mtod(m, struct ip6_hdr *)->ip6_flow) >> 20) & 0xff; break; #endif default: panic("ipip_input: unknown ip version %u (outer)", v>>4); } /* Remove outer IP header */ m_adj(m, iphlen); /* Sanity check */ if (m->m_pkthdr.len < sizeof(struct ip)) { V_ipipstat.ipips_hdrops++; m_freem(m); return; } m_copydata(m, 0, 1, &v); switch (v >> 4) { #ifdef INET case 4: hlen = sizeof(struct ip); break; #endif /* INET */ #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: V_ipipstat.ipips_family++; m_freem(m); return; /* EAFNOSUPPORT */ } /* * Bring the inner IP header into the first mbuf, if not there already. */ if (m->m_len < hlen) { if ((m = m_pullup(m, hlen)) == NULL) { DPRINTF(("%s: m_pullup (2) failed\n", __func__)); V_ipipstat.ipips_hdrops++; return; } } /* * RFC 1853 specifies that the inner TTL should not be touched on * decapsulation. There's no reason this comment should be here, but * this is as good a position as any. */ /* Some sanity checks on the inner IP header */ switch (v >> 4) { #ifdef INET case 4: ipo = mtod(m, struct ip *); nxt = ipo->ip_p; ip_ecn_egress(V_ip4_ipsec_ecn, &otos, &ipo->ip_tos); break; #endif /* INET */ #ifdef INET6 case 6: ip6 = (struct ip6_hdr *) ipo; nxt = ip6->ip6_nxt; itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; ip_ecn_egress(V_ip6_ipsec_ecn, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t) itos << 20); break; #endif default: panic("ipip_input: unknown ip version %u (inner)", v>>4); } /* Check for local address spoofing.
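 *
 * The check below walks every address configured on every interface and
 * drops the packet if the decapsulated source claims one of them, unless
 * the packet arrived on loopback or net.inet.ipip.allow is 2. A minimal
 * standalone sketch of the comparison, for illustration only (is_spoofed
 * and the sample addresses are hypothetical, not taken from this file):
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *
 *	// Return 1 if the inner source matches any local address.
 *	static int
 *	is_spoofed(struct in_addr src, const struct in_addr *local, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			if (local[i].s_addr == src.s_addr)
 *				return (1);
 *		return (0);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		struct in_addr locals[2], src;
 *
 *		inet_pton(AF_INET, "192.0.2.1", &locals[0]);
 *		inet_pton(AF_INET, "192.0.2.2", &locals[1]);
 *		inet_pton(AF_INET, "192.0.2.1", &src);
 *		printf("%d\n", is_spoofed(src, locals, 2));	// 1: drop
 *		return (0);
 *	}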
*/ if ((m->m_pkthdr.rcvif == NULL || !(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) && V_ipip_allow != 2) { IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { #ifdef INET if (ipo) { if (ifa->ifa_addr->sa_family != AF_INET) continue; sin = (struct sockaddr_in *) ifa->ifa_addr; if (sin->sin_addr.s_addr == ipo->ip_src.s_addr) { V_ipipstat.ipips_spoof++; m_freem(m); IFNET_RUNLOCK(); return; } } #endif /* INET */ #ifdef INET6 if (ip6) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *) ifa->ifa_addr; if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_src)) { V_ipipstat.ipips_spoof++; m_freem(m); IFNET_RUNLOCK(); return; } } #endif /* INET6 */ } } IFNET_RUNLOCK(); } /* Statistics */ V_ipipstat.ipips_ibytes += m->m_pkthdr.len - iphlen; #ifdef DEV_ENC switch (v >> 4) { #ifdef INET case 4: ipsec_bpf(m, NULL, AF_INET, ENC_IN|ENC_AFTER); break; #endif #ifdef INET6 case 6: ipsec_bpf(m, NULL, AF_INET6, ENC_IN|ENC_AFTER); break; #endif default: panic("%s: bogus ip version %u", __func__, v>>4); } /* pass the mbuf to enc0 for packet filtering */ if (ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_AFTER) != 0) return; #endif /* * Interface pointer stays the same; if no IPsec processing has * been done (or will be done), this will point to a normal * interface. Otherwise, it'll point to an enc interface, which * will allow a packet filter to distinguish between secure and * untrusted packets. */ switch (v >> 4) { #ifdef INET case 4: isr = NETISR_IP; break; #endif #ifdef INET6 case 6: isr = NETISR_IPV6; break; #endif default: panic("%s: bogus ip version %u", __func__, v>>4); } if (netisr_queue(isr, m)) { /* (0) on success. */ V_ipipstat.ipips_qfull++; DPRINTF(("%s: packet dropped because of full queue\n", __func__)); } } int ipip_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff ) { INIT_VNET_IPSEC(curvnet); #ifdef INET INIT_VNET_INET(curvnet); #endif /* INET */ struct secasvar *sav; u_int8_t tp, otos; struct secasindex *saidx; int error; #ifdef INET u_int8_t itos; struct ip *ipo; #endif /* INET */ #ifdef INET6 struct ip6_hdr *ip6, *ip6o; #endif /* INET6 */ sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->sah != NULL, ("null SAH")); /* XXX Deal with empty TDB source/destination addresses. */ m_copydata(m, 0, 1, &tp); tp = (tp >> 4) & 0xff; /* Get the IP version number. */ saidx = &sav->sah->saidx; switch (saidx->dst.sa.sa_family) { #ifdef INET case AF_INET: if (saidx->src.sa.sa_family != AF_INET || saidx->src.sin.sin_addr.s_addr == INADDR_ANY || saidx->dst.sin.sin_addr.s_addr == INADDR_ANY) { DPRINTF(("%s: unspecified tunnel endpoint " "address in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_ipipstat.ipips_unspec++; error = EINVAL; goto bad; } M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == 0) { DPRINTF(("%s: M_PREPEND failed\n", __func__)); V_ipipstat.ipips_hdrops++; error = ENOBUFS; goto bad; } ipo = mtod(m, struct ip *); ipo->ip_v = IPVERSION; ipo->ip_hl = 5; ipo->ip_len = htons(m->m_pkthdr.len); ipo->ip_ttl = V_ip_defttl; ipo->ip_sum = 0; ipo->ip_src = saidx->src.sin.sin_addr; ipo->ip_dst = saidx->dst.sin.sin_addr; ipo->ip_id = ip_newid(); /* If the inner protocol is IP... 
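 *
 * Immediately below, the inner header's ip_off is copied out, converted
 * to host order, stripped of DF, MF and the fragment offset, and written
 * back, so the outer header never inherits fragmentation state. A
 * minimal standalone sketch of that byte-order dance, for illustration
 * only (clear_frag_bits is a hypothetical name; the flag values match
 * netinet/ip.h):
 *
 *	#include <stdio.h>
 *	#include <arpa/inet.h>
 *
 *	#define IP_DF      0x4000	// don't fragment
 *	#define IP_MF      0x2000	// more fragments follow
 *	#define IP_OFFMASK 0x1fff	// fragment offset mask
 *
 *	// Clear fragmentation state from a network-order ip_off field.
 *	static unsigned short
 *	clear_frag_bits(unsigned short ip_off_net)
 *	{
 *		unsigned short off = ntohs(ip_off_net);
 *
 *		off &= ~(IP_DF | IP_MF | IP_OFFMASK);
 *		return (htons(off));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		// DF plus a nonzero offset: everything is cleared.
 *		printf("0x%04x\n", ntohs(clear_frag_bits(htons(0x4064))));
 *		return (0);
 *	}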
*/ if (tp == IPVERSION) { /* Save ECN notification */ m_copydata(m, sizeof(struct ip) + offsetof(struct ip, ip_tos), sizeof(u_int8_t), (caddr_t) &itos); ipo->ip_p = IPPROTO_IPIP; /* * We should be keeping tunnel soft-state and * send back ICMPs if needed. */ m_copydata(m, sizeof(struct ip) + offsetof(struct ip, ip_off), sizeof(u_int16_t), (caddr_t) &ipo->ip_off); ipo->ip_off = ntohs(ipo->ip_off); ipo->ip_off &= ~(IP_DF | IP_MF | IP_OFFMASK); ipo->ip_off = htons(ipo->ip_off); } #ifdef INET6 else if (tp == (IPV6_VERSION >> 4)) { u_int32_t itos32; /* Save ECN notification. */ m_copydata(m, sizeof(struct ip) + offsetof(struct ip6_hdr, ip6_flow), sizeof(u_int32_t), (caddr_t) &itos32); itos = ntohl(itos32) >> 20; ipo->ip_p = IPPROTO_IPV6; ipo->ip_off = 0; } #endif /* INET6 */ else { goto nofamily; } otos = 0; ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); ipo->ip_tos = otos; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&saidx->dst.sin6.sin6_addr) || saidx->src.sa.sa_family != AF_INET6 || IN6_IS_ADDR_UNSPECIFIED(&saidx->src.sin6.sin6_addr)) { DPRINTF(("%s: unspecified tunnel endpoint " "address in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); V_ipipstat.ipips_unspec++; error = ENOBUFS; goto bad; } /* scoped address handling */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = 0; M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT); if (m == 0) { DPRINTF(("%s: M_PREPEND failed\n", __func__)); V_ipipstat.ipips_hdrops++; error = ENOBUFS; goto bad; } /* Initialize IPv6 header */ ip6o = mtod(m, struct ip6_hdr *); ip6o->ip6_flow = 0; ip6o->ip6_vfc &= ~IPV6_VERSION_MASK; ip6o->ip6_vfc |= IPV6_VERSION; ip6o->ip6_plen = htons(m->m_pkthdr.len); ip6o->ip6_hlim = V_ip_defttl; ip6o->ip6_dst = saidx->dst.sin6.sin6_addr; ip6o->ip6_src = saidx->src.sin6.sin6_addr; #ifdef INET if (tp == IPVERSION) { /* Save ECN notification */ m_copydata(m, sizeof(struct ip6_hdr) + offsetof(struct ip, ip_tos), sizeof(u_int8_t), (caddr_t) &itos); /* This is really IPVERSION. */ ip6o->ip6_nxt = IPPROTO_IPIP; } else #endif /* INET */ if (tp == (IPV6_VERSION >> 4)) { u_int32_t itos32; /* Save ECN notification. 
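 *
 * For an inner IPv6 packet the class bits live in ip6_flow, laid out in
 * host order as version (4 bits) | traffic class (8 bits) | flow label
 * (20 bits), which is why the code shifts the word right by 20 after
 * ntohl(). A minimal standalone sketch of that extraction, for
 * illustration only (ip6_tclass is a hypothetical name, not a symbol in
 * this file):
 *
 *	#include <stdio.h>
 *	#include <arpa/inet.h>
 *
 *	// Pull the traffic-class byte out of a network-order ip6_flow word.
 *	static unsigned char
 *	ip6_tclass(unsigned int ip6_flow_net)
 *	{
 *		return ((ntohl(ip6_flow_net) >> 20) & 0xff);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		// Version 6, traffic class 0xb8, zero flow label.
 *		unsigned int flow = htonl((6U << 28) | (0xb8U << 20));
 *
 *		printf("0x%02x\n", ip6_tclass(flow));	// 0xb8
 *		return (0);
 *	}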
*/ m_copydata(m, sizeof(struct ip6_hdr) + offsetof(struct ip6_hdr, ip6_flow), sizeof(u_int32_t), (caddr_t) &itos32); itos = ntohl(itos32) >> 20; ip6o->ip6_nxt = IPPROTO_IPV6; } else { goto nofamily; } otos = 0; ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); ip6o->ip6_flow |= htonl((u_int32_t) otos << 20); break; #endif /* INET6 */ default: nofamily: DPRINTF(("%s: unsupported protocol family %u\n", __func__, saidx->dst.sa.sa_family)); V_ipipstat.ipips_family++; error = EAFNOSUPPORT; /* XXX diffs from openbsd */ goto bad; } V_ipipstat.ipips_opackets++; *mp = m; #ifdef INET if (saidx->dst.sa.sa_family == AF_INET) { #if 0 if (sav->tdb_xform->xf_type == XF_IP4) tdb->tdb_cur_bytes += m->m_pkthdr.len - sizeof(struct ip); #endif V_ipipstat.ipips_obytes += m->m_pkthdr.len - sizeof(struct ip); } #endif /* INET */ #ifdef INET6 if (saidx->dst.sa.sa_family == AF_INET6) { #if 0 if (sav->tdb_xform->xf_type == XF_IP4) tdb->tdb_cur_bytes += m->m_pkthdr.len - sizeof(struct ip6_hdr); #endif V_ipipstat.ipips_obytes += m->m_pkthdr.len - sizeof(struct ip6_hdr); } #endif /* INET6 */ return 0; bad: if (m) m_freem(m); *mp = NULL; return (error); } #ifdef IPSEC static int ipe4_init(struct secasvar *sav, struct xformsw *xsp) { sav->tdb_xform = xsp; return 0; } static int ipe4_zeroize(struct secasvar *sav) { sav->tdb_xform = NULL; return 0; } static int ipe4_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { /* This is a rather serious mistake, so no conditional printing. */ printf("%s: should never be called\n", __func__); if (m) m_freem(m); return EOPNOTSUPP; } static struct xformsw ipe4_xformsw = { XF_IP4, 0, "IPv4 Simple Encapsulation", ipe4_init, ipe4_zeroize, ipe4_input, ipip_output, }; extern struct domain inetdomain; static struct protosw ipe4_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR, ip4_input, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; #ifdef INET6 static struct ip6protosw ipe6_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, ip4_input6, 0, 0, rip_ctloutput, 0, 0, 0, 0, 0, &rip_usrreqs }; #endif /* * Check the encapsulated packet to see if we want it */ static int ipe4_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { /* * Only take packets coming from IPSEC tunnels; the rest * must be handled by the gif tunnel code. Note that we * also return a minimum priority when we want the packet * so any explicit gif tunnels take precedence. */ return ((m->m_flags & M_IPSEC) != 0 ? 1 : 0); } static void ipe4_attach(void) { + + V_ipip_allow = 0; + xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ (void) encap_attach_func(AF_INET, -1, ipe4_encapcheck, &ipe4_protosw, NULL); #ifdef INET6 (void) encap_attach_func(AF_INET6, -1, ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); #endif } SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL); #endif /* IPSEC */ Index: head/sys/sys/vimage.h =================================================================== --- head/sys/sys/vimage.h (revision 185087) +++ head/sys/sys/vimage.h (revision 185088) @@ -1,66 +1,68 @@ /*- * Copyright (c) 2006-2008 University of Zagreb * Copyright (c) 2006-2008 FreeBSD Foundation * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_VIMAGE_H_ #define _SYS_VIMAGE_H_ +#define VIMAGE_GLOBALS 1 + /* Non-VIMAGE null-macros */ #define CURVNET_SET(arg) #define CURVNET_SET_QUIET(arg) #define CURVNET_RESTORE() #define VNET_ASSERT(condition) #define VSYM(base, sym) (sym) #define INIT_FROM_VNET(vnet, modindex, modtype, sym) #define VNET_ITERATOR_DECL(arg) #define VNET_FOREACH(arg) #define VNET_LIST_RLOCK() #define VNET_LIST_RUNLOCK() #define INIT_VPROCG(arg) #define INIT_VCPU(arg) #define TD_TO_VIMAGE(td) #define TD_TO_VNET(td) #define TD_TO_VPROCG(td) #define TD_TO_VCPU(td) #define P_TO_VIMAGE(p) #define P_TO_VNET(p) #define P_TO_VPROCG(p) #define P_TO_VCPU(p) /* XXX these defines below should probably go into vprocg.h and vcpu.h */ #define VPROCG(sym) VSYM(vprocg, sym) #define VCPU(sym) VSYM(vcpu, sym) #define V_hostname VPROCG(hostname) #define G_hostname VSYM(basevprocg, hostname) /* global hostname */ #define V_domainname VPROCG(domainname) #endif /* !_SYS_VIMAGE_H_ */
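
A worked illustration of the pattern these vimage.h changes enable: a
subsystem global is compiled only when VIMAGE_GLOBALS is defined, every
reference goes through a V_ macro, and initialization moves from a static
initializer into the subsystem's attach routine (exactly what the
ipcomp_attach() and ipe4_attach() hunks above do for V_ipcomp_enable and
V_ipip_allow). With the null macros in this header, V_foo simply expands
to foo. A minimal standalone sketch under those assumptions; example_enable,
vnet_example and example_attach are illustrative names, not symbols from
this commit:

	#include <stdio.h>

	#define VIMAGE_GLOBALS 1

	/* Non-VIMAGE case: VSYM() discards the container, as above. */
	#define VSYM(base, sym)		(sym)
	#define V_example_enable	VSYM(vnet_example, example_enable)

	#ifdef VIMAGE_GLOBALS
	int example_enable;	/* no static initializer any more */
	#endif

	/* Runs at SYSINIT time in the kernel; a future per-vnet copy can
	 * reuse the same code path to initialize each instance. */
	static void
	example_attach(void)
	{
		V_example_enable = 0;
	}

	int
	main(void)
	{
		example_attach();
		printf("%d\n", V_example_enable);	/* prints 0 */
		return (0);
	}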