diff --git a/sys/dev/firewire/if_fwip.c b/sys/dev/firewire/if_fwip.c index 5237c555d999..b698db6c9620 100644 --- a/sys/dev/firewire/if_fwip.c +++ b/sys/dev/firewire/if_fwip.c @@ -1,937 +1,937 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004 * Doug Rabson * Copyright (c) 2002-2003 * Hidetoshi Shimokawa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * * This product includes software developed by Hidetoshi Shimokawa. * * 4. Neither the name of the author nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #include "opt_inet.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * We really need a mechanism for allocating regions in the FIFO * address space. We pick a address in the OHCI controller's 'middle' * address space. This means that the controller will automatically * send responses for us, which is fine since we don't have any * important information to put in the response anyway. */ #define INET_FIFO 0xfffe00000000LL #define FWIPDEBUG if (fwipdebug) if_printf #define TX_MAX_QUEUE (FWMAXQUEUE - 1) /* network interface */ static void fwip_start (if_t); static int fwip_ioctl (if_t, u_long, caddr_t); static void fwip_init (void *); static void fwip_post_busreset (void *); static void fwip_output_callback (struct fw_xfer *); static void fwip_async_output (struct fwip_softc *, if_t); static void fwip_start_send (void *, int); static void fwip_stream_input (struct fw_xferq *); static void fwip_unicast_input(struct fw_xfer *); static int fwipdebug = 0; static int broadcast_channel = 0xc0 | 0x1f; /* tag | channel(XXX) */ static int tx_speed = 2; static int rx_queue_len = FWMAXQUEUE; static MALLOC_DEFINE(M_FWIP, "if_fwip", "IP over FireWire interface"); SYSCTL_INT(_debug, OID_AUTO, if_fwip_debug, CTLFLAG_RW, &fwipdebug, 0, ""); SYSCTL_DECL(_hw_firewire); static SYSCTL_NODE(_hw_firewire, OID_AUTO, fwip, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Firewire ip subsystem"); SYSCTL_INT(_hw_firewire_fwip, OID_AUTO, rx_queue_len, CTLFLAG_RWTUN, &rx_queue_len, 0, "Length of the receive queue"); #ifdef DEVICE_POLLING static poll_handler_t fwip_poll; static int fwip_poll(if_t ifp, enum poll_cmd cmd, int count) { struct fwip_softc *fwip; struct firewire_comm *fc; if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) return (0); fwip = ((struct fwip_eth_softc 
*)if_getsoftc(ifp))->fwip; fc = fwip->fd.fc; fc->poll(fc, (cmd == POLL_AND_CHECK_STATUS)?0:1, count); return (0); } #endif /* DEVICE_POLLING */ static void fwip_identify(driver_t *driver, device_t parent) { BUS_ADD_CHILD(parent, 0, "fwip", device_get_unit(parent)); } static int fwip_probe(device_t dev) { device_t pa; pa = device_get_parent(dev); if (device_get_unit(dev) != device_get_unit(pa)) { return (ENXIO); } device_set_desc(dev, "IP over FireWire"); return (0); } static int fwip_attach(device_t dev) { struct fwip_softc *fwip; if_t ifp; int unit, s; struct fw_hwaddr *hwaddr; fwip = ((struct fwip_softc *)device_get_softc(dev)); unit = device_get_unit(dev); ifp = fwip->fw_softc.fwip_ifp = if_alloc(IFT_IEEE1394); if (ifp == NULL) return (ENOSPC); mtx_init(&fwip->mtx, "fwip", NULL, MTX_DEF); /* XXX */ fwip->dma_ch = -1; fwip->fd.fc = device_get_ivars(dev); if (tx_speed < 0) tx_speed = fwip->fd.fc->speed; fwip->fd.dev = dev; fwip->fd.post_explore = NULL; fwip->fd.post_busreset = fwip_post_busreset; fwip->fw_softc.fwip = fwip; TASK_INIT(&fwip->start_send, 0, fwip_start_send, fwip); /* * Encode our hardware the way that arp likes it. 
*/ hwaddr = &IFP2FWC(fwip->fw_softc.fwip_ifp)->fc_hwaddr; hwaddr->sender_unique_ID_hi = htonl(fwip->fd.fc->eui.hi); hwaddr->sender_unique_ID_lo = htonl(fwip->fd.fc->eui.lo); hwaddr->sender_max_rec = fwip->fd.fc->maxrec; hwaddr->sspd = fwip->fd.fc->speed; hwaddr->sender_unicast_FIFO_hi = htons((uint16_t)(INET_FIFO >> 32)); hwaddr->sender_unicast_FIFO_lo = htonl((uint32_t)INET_FIFO); /* fill the rest and attach interface */ if_setsoftc(ifp, &fwip->fw_softc); if_initname(ifp, device_get_name(dev), unit); if_setinitfn(ifp, fwip_init); if_setstartfn(ifp, fwip_start); if_setioctlfn(ifp, fwip_ioctl); if_setflags(ifp, (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST)); if_setsendqlen(ifp, TX_MAX_QUEUE); #ifdef DEVICE_POLLING if_setcapabilitiesbit(ifp, IFCAP_POLLING, 0); #endif s = splimp(); firewire_ifattach(ifp, hwaddr); splx(s); FWIPDEBUG(ifp, "interface created\n"); return 0; } static void fwip_stop(struct fwip_softc *fwip) { struct firewire_comm *fc; struct fw_xferq *xferq; if_t ifp = fwip->fw_softc.fwip_ifp; struct fw_xfer *xfer, *next; int i; fc = fwip->fd.fc; if (fwip->dma_ch >= 0) { xferq = fc->ir[fwip->dma_ch]; if (xferq->flag & FWXFERQ_RUNNING) fc->irx_disable(fc, fwip->dma_ch); xferq->flag &= ~(FWXFERQ_MODEMASK | FWXFERQ_OPEN | FWXFERQ_STREAM | FWXFERQ_EXTBUF | FWXFERQ_HANDLER | FWXFERQ_CHTAGMASK); xferq->hand = NULL; for (i = 0; i < xferq->bnchunk; i++) m_freem(xferq->bulkxfer[i].mbuf); free(xferq->bulkxfer, M_FWIP); fw_bindremove(fc, &fwip->fwb); for (xfer = STAILQ_FIRST(&fwip->fwb.xferlist); xfer != NULL; xfer = next) { next = STAILQ_NEXT(xfer, link); fw_xfer_free(xfer); } for (xfer = STAILQ_FIRST(&fwip->xferlist); xfer != NULL; xfer = next) { next = STAILQ_NEXT(xfer, link); fw_xfer_free(xfer); } STAILQ_INIT(&fwip->xferlist); xferq->bulkxfer = NULL; fwip->dma_ch = -1; } if_setdrvflagbits(ifp, 0, (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); } static int fwip_detach(device_t dev) { struct fwip_softc *fwip; if_t ifp; int s; fwip = (struct fwip_softc *)device_get_softc(dev); 
ifp = fwip->fw_softc.fwip_ifp; #ifdef DEVICE_POLLING if (if_getcapenable(ifp) & IFCAP_POLLING) ether_poll_deregister(ifp); #endif s = splimp(); fwip_stop(fwip); firewire_ifdetach(ifp); if_free(ifp); mtx_destroy(&fwip->mtx); splx(s); return 0; } static void fwip_init(void *arg) { struct fwip_softc *fwip = ((struct fwip_eth_softc *)arg)->fwip; struct firewire_comm *fc; if_t ifp = fwip->fw_softc.fwip_ifp; struct fw_xferq *xferq; struct fw_xfer *xfer; struct mbuf *m; int i; FWIPDEBUG(ifp, "initializing\n"); fc = fwip->fd.fc; #define START 0 if (fwip->dma_ch < 0) { fwip->dma_ch = fw_open_isodma(fc, /* tx */0); if (fwip->dma_ch < 0) return; xferq = fc->ir[fwip->dma_ch]; xferq->flag |= FWXFERQ_EXTBUF | FWXFERQ_HANDLER | FWXFERQ_STREAM; xferq->flag &= ~0xff; xferq->flag |= broadcast_channel & 0xff; /* register fwip_input handler */ xferq->sc = (caddr_t) fwip; xferq->hand = fwip_stream_input; xferq->bnchunk = rx_queue_len; xferq->bnpacket = 1; xferq->psize = MCLBYTES; xferq->queued = 0; xferq->buf = NULL; xferq->bulkxfer = (struct fw_bulkxfer *) malloc( sizeof(struct fw_bulkxfer) * xferq->bnchunk, M_FWIP, M_WAITOK); if (xferq->bulkxfer == NULL) { printf("if_fwip: malloc failed\n"); return; } STAILQ_INIT(&xferq->stvalid); STAILQ_INIT(&xferq->stfree); STAILQ_INIT(&xferq->stdma); xferq->stproc = NULL; for (i = 0; i < xferq->bnchunk; i++) { m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); xferq->bulkxfer[i].mbuf = m; m->m_len = m->m_pkthdr.len = m->m_ext.ext_size; STAILQ_INSERT_TAIL(&xferq->stfree, &xferq->bulkxfer[i], link); } fwip->fwb.start = INET_FIFO; fwip->fwb.end = INET_FIFO + 16384; /* S3200 packet size */ /* pre-allocate xfer */ STAILQ_INIT(&fwip->fwb.xferlist); for (i = 0; i < rx_queue_len; i++) { xfer = fw_xfer_alloc(M_FWIP); if (xfer == NULL) break; m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); xfer->recv.payload = mtod(m, uint32_t *); xfer->recv.pay_len = MCLBYTES; xfer->hand = fwip_unicast_input; xfer->fc = fc; xfer->sc = (caddr_t)fwip; xfer->mbuf = m; 
STAILQ_INSERT_TAIL(&fwip->fwb.xferlist, xfer, link); } fw_bindadd(fc, &fwip->fwb); STAILQ_INIT(&fwip->xferlist); for (i = 0; i < TX_MAX_QUEUE; i++) { xfer = fw_xfer_alloc(M_FWIP); if (xfer == NULL) break; xfer->send.spd = tx_speed; xfer->fc = fwip->fd.fc; xfer->sc = (caddr_t)fwip; xfer->hand = fwip_output_callback; STAILQ_INSERT_TAIL(&fwip->xferlist, xfer, link); } } else xferq = fc->ir[fwip->dma_ch]; fwip->last_dest.hi = 0; fwip->last_dest.lo = 0; /* start dma */ if ((xferq->flag & FWXFERQ_RUNNING) == 0) fc->irx_enable(fc, fwip->dma_ch); if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); #if 0 /* attempt to start output */ fwip_start(ifp); #endif } static int fwip_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct fwip_softc *fwip = ((struct fwip_eth_softc *)if_getsoftc(ifp))->fwip; int s, error; switch (cmd) { case SIOCSIFFLAGS: s = splimp(); if (if_getflags(ifp) & IFF_UP) { if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) fwip_init(&fwip->fw_softc); } else { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) fwip_stop(fwip); } splx(s); break; case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFCAP: #ifdef DEVICE_POLLING { struct ifreq *ifr = (struct ifreq *) data; struct firewire_comm *fc = fwip->fd.fc; if (ifr->ifr_reqcap & IFCAP_POLLING && !(if_getcapenable(ifp) & IFCAP_POLLING)) { error = ether_poll_register(fwip_poll, ifp); if (error) return (error); /* Disable interrupts */ fc->set_intr(fc, 0); if_setcapenablebit(ifp, IFCAP_POLLING, 0); return (error); } if (!(ifr->ifr_reqcap & IFCAP_POLLING) && if_getcapenable(ifp) & IFCAP_POLLING) { error = ether_poll_deregister(ifp); /* Enable interrupts. 
*/ fc->set_intr(fc, 1); if_setcapenablebit(ifp, 0, IFCAP_POLLING); return (error); } } #endif /* DEVICE_POLLING */ break; default: s = splimp(); error = firewire_ioctl(ifp, cmd, data); splx(s); return (error); } return (0); } static void fwip_post_busreset(void *arg) { struct fwip_softc *fwip = arg; struct crom_src *src; struct crom_chunk *root; src = fwip->fd.fc->crom_src; root = fwip->fd.fc->crom_root; /* RFC2734 IPv4 over IEEE1394 */ bzero(&fwip->unit4, sizeof(struct crom_chunk)); crom_add_chunk(src, root, &fwip->unit4, CROM_UDIR); crom_add_entry(&fwip->unit4, CSRKEY_SPEC, CSRVAL_IETF); crom_add_simple_text(src, &fwip->unit4, &fwip->spec4, "IANA"); crom_add_entry(&fwip->unit4, CSRKEY_VER, 1); crom_add_simple_text(src, &fwip->unit4, &fwip->ver4, "IPv4"); /* RFC3146 IPv6 over IEEE1394 */ bzero(&fwip->unit6, sizeof(struct crom_chunk)); crom_add_chunk(src, root, &fwip->unit6, CROM_UDIR); crom_add_entry(&fwip->unit6, CSRKEY_SPEC, CSRVAL_IETF); crom_add_simple_text(src, &fwip->unit6, &fwip->spec6, "IANA"); crom_add_entry(&fwip->unit6, CSRKEY_VER, 2); crom_add_simple_text(src, &fwip->unit6, &fwip->ver6, "IPv6"); fwip->last_dest.hi = 0; fwip->last_dest.lo = 0; firewire_busreset(fwip->fw_softc.fwip_ifp); } static void fwip_output_callback(struct fw_xfer *xfer) { struct fwip_softc *fwip; if_t ifp; int s; fwip = (struct fwip_softc *)xfer->sc; ifp = fwip->fw_softc.fwip_ifp; /* XXX error check */ FWIPDEBUG(ifp, "resp = %d\n", xfer->resp); if (xfer->resp != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(xfer->mbuf); fw_xfer_unload(xfer); s = splimp(); FWIP_LOCK(fwip); STAILQ_INSERT_TAIL(&fwip->xferlist, xfer, link); FWIP_UNLOCK(fwip); splx(s); /* for queue full */ if (!if_sendq_empty(ifp)) { fwip_start(ifp); } } static void fwip_start(if_t ifp) { struct fwip_softc *fwip = ((struct fwip_eth_softc *)if_getsoftc(ifp))->fwip; int s; FWIPDEBUG(ifp, "starting\n"); if (fwip->dma_ch < 0) { struct mbuf *m = NULL; FWIPDEBUG(ifp, "not ready\n"); s = splimp(); do { m = 
if_dequeue(ifp); if (m != NULL) m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } while (m != NULL); splx(s); return; } s = splimp(); if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); if (!if_sendq_empty(ifp)) fwip_async_output(fwip, ifp); if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); splx(s); } /* Async. stream output */ static void fwip_async_output(struct fwip_softc *fwip, if_t ifp) { struct firewire_comm *fc = fwip->fd.fc; struct mbuf *m; struct m_tag *mtag; struct fw_hwaddr *destfw; struct fw_xfer *xfer; struct fw_xferq *xferq; struct fw_pkt *fp; uint16_t nodeid; int error; int i = 0; xfer = NULL; xferq = fc->atq; while ((xferq->queued < xferq->maxq - 1) && !if_sendq_empty(ifp)) { FWIP_LOCK(fwip); xfer = STAILQ_FIRST(&fwip->xferlist); if (xfer == NULL) { FWIP_UNLOCK(fwip); #if 0 printf("if_fwip: lack of xfer\n"); #endif break; } STAILQ_REMOVE_HEAD(&fwip->xferlist, link); FWIP_UNLOCK(fwip); m = if_dequeue(ifp); if (m == NULL) { FWIP_LOCK(fwip); STAILQ_INSERT_HEAD(&fwip->xferlist, xfer, link); FWIP_UNLOCK(fwip); break; } /* * Dig out the link-level address which * firewire_output got via arp or neighbour * discovery. If we don't have a link-level address, * just stick the thing on the broadcast channel. */ mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, 0); if (mtag == NULL) destfw = NULL; else destfw = (struct fw_hwaddr *) (mtag + 1); /* * We don't do any bpf stuff here - the generic code * in firewire_output gives the packet to bpf before * it adds the link-level encapsulation. */ /* * Put the mbuf in the xfer early in case we hit an * error case below - fwip_output_callback will free * the mbuf. */ xfer->mbuf = m; /* * We use the arp result (if any) to add a suitable firewire * packet header before handing off to the bus. */ fp = &xfer->send.hdr; nodeid = FWLOCALBUS | fc->nodeid; if ((m->m_flags & M_BCAST) || !destfw) { /* * Broadcast packets are sent as GASP packets with * specifier ID 0x00005e, version 1 on the broadcast * channel. 
To be conservative, we send at the * slowest possible speed. */ uint32_t *p; M_PREPEND(m, 2*sizeof(uint32_t), M_NOWAIT); p = mtod(m, uint32_t *); fp->mode.stream.len = m->m_pkthdr.len; fp->mode.stream.chtag = broadcast_channel; fp->mode.stream.tcode = FWTCODE_STREAM; fp->mode.stream.sy = 0; xfer->send.spd = 0; p[0] = htonl(nodeid << 16); p[1] = htonl((0x5e << 24) | 1); } else { /* * Unicast packets are sent as block writes to the * target's unicast fifo address. If we can't * find the node address, we just give up. We * could broadcast it but that might overflow * the packet size limitations due to the * extra GASP header. Note: the hardware * address is stored in network byte order to * make life easier for ARP. */ struct fw_device *fd; struct fw_eui64 eui; eui.hi = ntohl(destfw->sender_unique_ID_hi); eui.lo = ntohl(destfw->sender_unique_ID_lo); if (fwip->last_dest.hi != eui.hi || fwip->last_dest.lo != eui.lo) { fd = fw_noderesolve_eui64(fc, &eui); if (!fd) { /* error */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX set error code */ fwip_output_callback(xfer); continue; } fwip->last_hdr.mode.wreqb.dst = FWLOCALBUS | fd->dst; fwip->last_hdr.mode.wreqb.tlrt = 0; fwip->last_hdr.mode.wreqb.tcode = FWTCODE_WREQB; fwip->last_hdr.mode.wreqb.pri = 0; fwip->last_hdr.mode.wreqb.src = nodeid; fwip->last_hdr.mode.wreqb.dest_hi = ntohs(destfw->sender_unicast_FIFO_hi); fwip->last_hdr.mode.wreqb.dest_lo = ntohl(destfw->sender_unicast_FIFO_lo); fwip->last_hdr.mode.wreqb.extcode = 0; fwip->last_dest = eui; } fp->mode.wreqb = fwip->last_hdr.mode.wreqb; fp->mode.wreqb.len = m->m_pkthdr.len; xfer->send.spd = min(destfw->sspd, fc->speed); } xfer->send.pay_len = m->m_pkthdr.len; error = fw_asyreq(fc, -1, xfer); if (error == EAGAIN) { /* * We ran out of tlabels - requeue the packet * for later transmission. 
*/ xfer->mbuf = 0; FWIP_LOCK(fwip); STAILQ_INSERT_TAIL(&fwip->xferlist, xfer, link); FWIP_UNLOCK(fwip); if_sendq_prepend(ifp, m); break; } if (error) { /* error */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX set error code */ fwip_output_callback(xfer); continue; } else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); i++; } } #if 0 if (i > 1) printf("%d queued\n", i); #endif if (i > 0) xferq->start(fc); } static void fwip_start_send (void *arg, int count) { struct fwip_softc *fwip = arg; fwip->fd.fc->atq->start(fwip->fd.fc); } /* Async. stream output */ static void fwip_stream_input(struct fw_xferq *xferq) { struct epoch_tracker et; struct mbuf *m, *m0; struct m_tag *mtag; if_t ifp; struct fwip_softc *fwip; struct fw_bulkxfer *sxfer; struct fw_pkt *fp; uint16_t src; uint32_t *p; fwip = (struct fwip_softc *)xferq->sc; ifp = fwip->fw_softc.fwip_ifp; NET_EPOCH_ENTER(et); while ((sxfer = STAILQ_FIRST(&xferq->stvalid)) != NULL) { STAILQ_REMOVE_HEAD(&xferq->stvalid, link); fp = mtod(sxfer->mbuf, struct fw_pkt *); if (fwip->fd.fc->irx_post != NULL) fwip->fd.fc->irx_post(fwip->fd.fc, fp->mode.ld); m = sxfer->mbuf; /* insert new rbuf */ sxfer->mbuf = m0 = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m0 != NULL) { m0->m_len = m0->m_pkthdr.len = m0->m_ext.ext_size; STAILQ_INSERT_TAIL(&xferq->stfree, sxfer, link); } else printf("fwip_as_input: m_getcl failed\n"); /* * We must have a GASP header - leave the * encapsulation sanity checks to the generic * code. Remember that we also have the firewire async * stream header even though that isn't accounted for * in mode.stream.len. */ if (sxfer->resp != 0 || fp->mode.stream.len < 2*sizeof(uint32_t)) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); continue; } m->m_len = m->m_pkthdr.len = fp->mode.stream.len + sizeof(fp->mode.stream); /* * If we received the packet on the broadcast channel, * mark it as broadcast, otherwise we assume it must * be multicast. 
*/ if (fp->mode.stream.chtag == broadcast_channel) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; /* * Make sure we recognise the GASP specifier and * version. */ p = mtod(m, uint32_t *); if ((((ntohl(p[1]) & 0xffff) << 8) | ntohl(p[2]) >> 24) != 0x00005e || (ntohl(p[2]) & 0xffffff) != 1) { FWIPDEBUG(ifp, "Unrecognised GASP header %#08x %#08x\n", ntohl(p[1]), ntohl(p[2])); m_freem(m); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); continue; } /* * Record the sender ID for possible BPF usage. */ src = ntohl(p[1]) >> 16; - if (bpf_peers_present(if_getbpf(ifp))) { + if (bpf_peers_present_if(ifp)) { mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 2*sizeof(uint32_t), M_NOWAIT); if (mtag) { /* bpf wants it in network byte order */ struct fw_device *fd; uint32_t *p = (uint32_t *) (mtag + 1); fd = fw_noderesolve_nodeid(fwip->fd.fc, src & 0x3f); if (fd) { p[0] = htonl(fd->eui.hi); p[1] = htonl(fd->eui.lo); } else { p[0] = 0; p[1] = 0; } m_tag_prepend(m, mtag); } } /* * Trim off the GASP header */ m_adj(m, 3*sizeof(uint32_t)); m->m_pkthdr.rcvif = ifp; firewire_input(ifp, m, src); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); } NET_EPOCH_EXIT(et); if (STAILQ_FIRST(&xferq->stfree) != NULL) fwip->fd.fc->irx_enable(fwip->fd.fc, fwip->dma_ch); } static __inline void fwip_unicast_input_recycle(struct fwip_softc *fwip, struct fw_xfer *xfer) { struct mbuf *m; /* * We have finished with a unicast xfer. Allocate a new * cluster and stick it on the back of the input queue. 
*/ m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); xfer->mbuf = m; xfer->recv.payload = mtod(m, uint32_t *); xfer->recv.pay_len = MCLBYTES; xfer->mbuf = m; STAILQ_INSERT_TAIL(&fwip->fwb.xferlist, xfer, link); } static void fwip_unicast_input(struct fw_xfer *xfer) { uint64_t address; struct mbuf *m; struct m_tag *mtag; struct epoch_tracker et; if_t ifp; struct fwip_softc *fwip; struct fw_pkt *fp; //struct fw_pkt *sfp; int rtcode; fwip = (struct fwip_softc *)xfer->sc; ifp = fwip->fw_softc.fwip_ifp; m = xfer->mbuf; xfer->mbuf = 0; fp = &xfer->recv.hdr; /* * Check the fifo address - we only accept addresses of * exactly INET_FIFO. */ address = ((uint64_t)fp->mode.wreqb.dest_hi << 32) | fp->mode.wreqb.dest_lo; if (fp->mode.wreqb.tcode != FWTCODE_WREQB) { rtcode = FWRCODE_ER_TYPE; } else if (address != INET_FIFO) { rtcode = FWRCODE_ER_ADDR; } else { rtcode = FWRCODE_COMPLETE; } NET_EPOCH_ENTER(et); /* * Pick up a new mbuf and stick it on the back of the receive * queue. */ fwip_unicast_input_recycle(fwip, xfer); /* * If we've already rejected the packet, give up now. */ if (rtcode != FWRCODE_COMPLETE) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto done; } - if (bpf_peers_present(if_getbpf(ifp))) { + if (bpf_peers_present_if(ifp)) { /* * Record the sender ID for possible BPF usage. */ mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 2*sizeof(uint32_t), M_NOWAIT); if (mtag) { /* bpf wants it in network byte order */ struct fw_device *fd; uint32_t *p = (uint32_t *) (mtag + 1); fd = fw_noderesolve_nodeid(fwip->fd.fc, fp->mode.wreqb.src & 0x3f); if (fd) { p[0] = htonl(fd->eui.hi); p[1] = htonl(fd->eui.lo); } else { p[0] = 0; p[1] = 0; } m_tag_prepend(m, mtag); } } /* * Hand off to the generic encapsulation code. We don't use * ifp->if_input so that we can pass the source nodeid as an * argument to facilitate link-level fragment reassembly. 
*/ m->m_len = m->m_pkthdr.len = fp->mode.wreqb.len; m->m_pkthdr.rcvif = ifp; firewire_input(ifp, m, fp->mode.wreqb.src); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); done: NET_EPOCH_EXIT(et); } static device_method_t fwip_methods[] = { /* device interface */ DEVMETHOD(device_identify, fwip_identify), DEVMETHOD(device_probe, fwip_probe), DEVMETHOD(device_attach, fwip_attach), DEVMETHOD(device_detach, fwip_detach), { 0, 0 } }; static driver_t fwip_driver = { "fwip", fwip_methods, sizeof(struct fwip_softc), }; DRIVER_MODULE(fwip, firewire, fwip_driver, 0, 0); MODULE_VERSION(fwip, 1); MODULE_DEPEND(fwip, firewire, 1, 1, 1); diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c index 7d8e1914163e..f6f885873a79 100644 --- a/sys/dev/hyperv/netvsc/if_hn.c +++ b/sys/dev/hyperv/netvsc/if_hn.c @@ -1,7680 +1,7680 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_hn.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmbus_if.h" #define HN_IFSTART_SUPPORT #define HN_RING_CNT_DEF_MAX 8 #define HN_VFMAP_SIZE_DEF 8 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 #define HN_PKTBUF_LEN_DEF (16 * 1024) #define HN_LROENT_CNT_DEF 128 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK(sc) \ do { \ while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ /* Relinquish cpu to avoid deadlock */ \ sched_relinquish(curthread); \ DELAY(1000); \ } \ } while (0) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) #define HN_PKTSIZE_MIN(align) \ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ HN_RNDIS_PKT_LEN, (align)) #define HN_PKTSIZE(m, align) \ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) #ifdef RSS #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) #else #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) #endif struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif STAILQ_ENTRY(hn_txdesc) agg_link; /* Aggregated txdescs, in sending order. */ STAILQ_HEAD(, hn_txdesc) agg_list; /* The oldest packet, if transmission aggregation happens. 
*/ struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ struct hn_nvs_sendctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x0001 #define HN_TXD_FLAG_DMAMAP 0x0002 #define HN_TXD_FLAG_ONAGG 0x0004 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 struct packet_info_id { uint8_t ver; uint8_t flag; uint16_t pkt_id; }; #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) struct hn_rxinfo { const uint32_t *vlan_info; const uint32_t *csum_info; const uint32_t *hash_info; const uint32_t *hash_value; const struct packet_info_id *pktinfo_id; }; struct hn_rxvf_setarg { struct hn_rx_ring *rxr; if_t vf_ifp; }; #define HN_RXINFO_VLAN 0x0001 #define HN_RXINFO_CSUM 0x0002 #define HN_RXINFO_HASHINF 0x0004 #define HN_RXINFO_HASHVAL 0x0008 #define HN_RXINFO_PKTINFO_ID 0x0010 #define HN_RXINFO_ALL \ (HN_RXINFO_VLAN | \ HN_RXINFO_CSUM | \ HN_RXINFO_HASHINF | \ HN_RXINFO_HASHVAL | \ HN_RXINFO_PKTINFO_ID) static int hn_probe(device_t); static int hn_attach(device_t); static int hn_detach(device_t); static int hn_shutdown(device_t); static void hn_chan_callback(struct vmbus_channel *, void *); static void hn_init(void *); static int hn_ioctl(if_t, u_long, caddr_t); #ifdef HN_IFSTART_SUPPORT static void hn_start(if_t); #endif static int hn_transmit(if_t, struct mbuf *); static void hn_xmit_qflush(if_t); static int hn_ifmedia_upd(if_t); static void hn_ifmedia_sts(if_t, struct ifmediareq *); static void hn_ifnet_event(void *, if_t, int); static void hn_ifaddr_event(void *, if_t); static void hn_ifnet_attevent(void *, if_t); static void hn_ifnet_detevent(void *, if_t); static void hn_ifnet_lnkevent(void *, if_t, int); static bool hn_ismyvf(const struct hn_softc *, const if_t); static void hn_rxvf_change(struct hn_softc *, if_t, bool); static void 
hn_rxvf_set(struct hn_softc *, if_t); static void hn_rxvf_set_task(void *, int); static void hn_xpnt_vf_input(if_t, struct mbuf *); static int hn_xpnt_vf_iocsetflags(struct hn_softc *); static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *); static void hn_xpnt_vf_saveifflags(struct hn_softc *); static bool hn_xpnt_vf_isready(struct hn_softc *); static void hn_xpnt_vf_setready(struct hn_softc *); static void hn_xpnt_vf_init_taskfunc(void *, int); static void hn_xpnt_vf_init(struct hn_softc *); static void hn_xpnt_vf_setenable(struct hn_softc *); static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); static void hn_vf_rss_fixup(struct hn_softc *, bool); static void hn_vf_rss_restore(struct hn_softc *); static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *); static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int); static void hn_rndis_rx_status(struct hn_softc *, const void *, int); static void hn_rndis_init_fixat(struct hn_softc *, int); static void hn_nvs_handle_notify(struct hn_softc *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, uint64_t); static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int 
hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); static void hn_stop(struct hn_softc *, bool); static void hn_init_locked(struct hn_softc *); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static int hn_synth_alloc_subchans(struct hn_softc *, int *); static bool hn_synth_attachable(const struct hn_softc *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_suspend_mgmt_taskfunc(void *, int); static void hn_chan_drain(struct hn_softc *, struct vmbus_channel *); static void 
hn_disable_rx(struct hn_softc *); static void hn_drain_rxtx(struct hn_softc *, int); static void hn_polling(struct hn_softc *, u_int); static void hn_chan_polling(struct vmbus_channel *, u_int); static void hn_mtu_change_fixup(struct hn_softc *); static void hn_update_link_status(struct hn_softc *); static void hn_change_network(struct hn_softc *); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_link_status(struct hn_softc *); static int hn_create_rx_data(struct hn_softc *, int); static void hn_destroy_rx_data(struct hn_softc *); static int hn_check_iplen(const struct mbuf *, int); static void hn_rxpkt_proto(const struct mbuf *, int *, int *); static int hn_set_rxfilter(struct hn_softc *, uint32_t); static int hn_rxfilter_config(struct hn_softc *); static int hn_rss_reconfig(struct hn_softc *); static void hn_rss_ind_fixup(struct hn_softc *); static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); static int hn_rxpkt(struct hn_rx_ring *); static uint32_t hn_rss_type_fromndis(uint32_t); static uint32_t hn_rss_type_tondis(uint32_t); static int hn_tx_ring_create(struct hn_softc *, int); static void hn_tx_ring_destroy(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_fixup_rx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); static void hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *); static int hn_encap(if_t, struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_txpkt(if_t, struct hn_tx_ring *, struct hn_txdesc *); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_tx_ring_qflush(struct hn_tx_ring *); static void 
hn_resume_tx(struct hn_softc *, int); static void hn_set_txagg(struct hn_softc *); static void *hn_try_txagg(if_t, struct hn_tx_ring *, struct hn_txdesc *, int); static int hn_get_txswq_depth(const struct hn_tx_ring *); static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); static int hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_txeof_taskfunc(void *, int); #ifdef HN_IFSTART_SUPPORT static int hn_start_locked(struct hn_tx_ring *, int); static void hn_start_taskfunc(void *, int); static void hn_start_txeof(struct hn_tx_ring *); static void hn_start_txeof_taskfunc(void *, int); #endif static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segment verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segment verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* * Offload UDP/IPv4 checksum. 
*/
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif

/* # of TX taskqueues */
static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

/* Values accepted by the tx_taskq_mode tunable below. */
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

/* Read-only report of the compile-time HN_USE_TXDESC_BUFRING choice. */
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static
int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mod");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routing; unit seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routing; unit: seconds");

static u_int hn_cpu_index;	/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

/*
 * VF map, indexed by the VF's interface index (see hn_xpnt_vf_input());
 * hn_vfmap_lock is taken around lookups.
 */
static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static if_t *hn_vfmap;

#ifndef RSS
/* Default Toeplitz RSS key; only compiled when the kernel RSS option is off. */
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

/* NOTE(review): presumably the vmbus device-type GUID matched in hn_probe(). */
static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

/*
 * Apply the LRO length limit 'lenlim' to every RX ring of 'sc'.
 */
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}

/*
 * Send 'txd' as an RNDIS data message using the ring's GPA list.
 * The descriptor must not own a chimney buffer.  Returns the result
 * of hn_nvs_send_rndis_sglist().
 */
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

/*
 * Send 'txd' as an RNDIS data message that references the chimney
 * buffer slot recorded in the descriptor.  Returns the result of
 * hn_nvs_send().
 */
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

/*
 * Claim a free slot in the chimney bitmap.
 *
 * Returns the slot index, or HN_NVS_CHIM_IDX_INVALID if none could be
 * claimed.
 */
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		/* First clear bit in this word; 0 means the word is full. */
		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		/*
		 * The bit was already set by the time we tried to take it;
		 * move on to the next word instead of rescanning this one.
		 */
		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

/*
 * Release chimney slot 'chim_idx' by clearing its bit in the bitmap.
 * The slot must currently be marked as allocated.
 */
static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)

/*
 * Make the first 'len' bytes of 'm' contiguous.  'm' is reassigned to
 * the m_pullup() result; on failure the enclosing function returns NULL.
 */
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function failed, the m_head would be freed.
*/ static __inline struct mbuf * hn_tso_fixup(struct mbuf *m_head) { struct ether_vlan_header *evl; struct tcphdr *th; int ehlen; KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); th = mtodo(m_head, ehlen + sizeof(*ip6)); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. 
*/ static __inline struct mbuf * hn_set_hlen(struct mbuf *m_head) { const struct ether_vlan_header *evl; int ehlen; PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, const struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { const struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; /* * UDP checksum offload does not work in Azure, if the * following conditions meet: * - sizeof(IP hdr + UDP hdr + payload) > 1420. * - IP_DF is not set in the IP hdr. * * Fallback to software checksum for these UDP datagrams. */ if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && (ntohs(ip->ip_off) & IP_DF) == 0) { uint16_t off = ehlen + iphlen; counter_u64_add(hn_udpcs_fixup, 1); PULLUP_HDR(m_head, off + sizeof(struct udphdr)); *(uint16_t *)(m_head->m_data + off + m_head->m_pkthdr.csum_data) = in_cksum_skip( m_head, m_head->m_pkthdr.len, off); m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { const struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP && ip6->ip6_nxt != IPPROTO_UDP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. 
*/
/*
 * Set *tcpsyn to 1 if the TCP header of 'm_head' has TH_SYN set, else 0.
 * Header offsets come from the l2hlen/l3hlen already stored in the
 * packet header.  Returns the (possibly different) chain head, or NULL.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

/*
 * Program RX filter 'filter' through RNDIS unless it is already the
 * cached one.  The cache is only updated on success.  Returns 0 or the
 * error from hn_rndis_set_rxfilter().
 */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

/*
 * Derive the NDIS RX filter from the interface flags and apply it.
 */
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	if_t ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
	 */
	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (if_getflags(ifp) & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((if_getflags(ifp) & IFF_ALLMULTI) || !if_maddr_empty(ifp))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

/*
 * Compute the TX aggregation size/packet limits from the softc settings
 * and the RNDIS-reported limits, then store them in every TX ring under
 * that ring's hn_tx_lock.  Size and packet count are both set to 0
 * (aggregation disabled) when any limit is too small or the alignment
 * does not fit the per-ring field.
 */
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

/*
 * Return the software TX queue depth for 'txr': the hn_tx_swq_depth
 * tunable, but never less than the ring's descriptor count.
 */
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

/*
 * Re-apply the RSS configuration by disabling RSS and enabling it again.
 * Returns ENXIO if the synthetic parts are not attached, otherwise 0 or
 * the error from hn_rndis_conf_rss().
 */
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
*/
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

/*
 * Clamp every RSS indirect table entry that refers to a channel beyond
 * the rings in use to the last usable channel (nchan - 1).
 */
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

/* Media changes are not supported; always returns EOPNOTSUPP. */
static int
hn_ifmedia_upd(if_t ifp __unused)
{

	return EOPNOTSUPP;
}

/*
 * Report media status: IFM_NONE while the link-up flag is clear,
 * otherwise active 10G-T full duplex.
 */
static void
hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = if_getsoftc(ifp);

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* Task body for hn_rxvf_set(): store the VF ifp in one RX ring. */
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

/*
 * Set hn_rxvf_ifp of every RX ring to 'vf_ifp' (NULL clears it).
 * Rings in use are updated through vmbus_chan_run_task() on their
 * channel; the remaining rings are written directly.
 */
static void
hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	/* One on-stack task/argument pair is reused for each ring in turn. */
	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}

/*
 * Return true if 'ifp' looks like the VF paired with 'sc': a different,
 * Ethernet, non-lagg/vlan interface with the same link-level address
 * as hn(4)'s own interface.
 */
static bool
hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
{
	if_t hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (if_getalloctype(ifp) != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
	    strcmp(if_getdname(ifp), "vlan") == 0)
		return (false);

	/*
	 * During detach
events if_getifaddr(ifp) might be NULL.
	 * Make sure the bcmp() below doesn't panic on that:
	 */
	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
		return (false);

	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}

/*
 * Switch the datapath to ('rxvf' true) or away from ('rxvf' false) the
 * VF 'ifp'.  Does nothing if the synthetic parts are not attached, if
 * 'ifp' is not our VF, or if the requested state is already in effect.
 * Takes and releases the softc lock itself.
 */
static void
hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
{
	if_t hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", if_name(ifp));
	}
out:
	HN_UNLOCK(sc);
}

/* Interface up/down event: forward to hn_rxvf_change(); ignore others. */
static void
hn_ifnet_event(void *arg, if_t ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

/* Address event: re-evaluate the VF state from the interface's IFF_UP. */
static void
hn_ifaddr_event(void *arg, if_t ifp)
{

	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
}

/*
 * Copy the VF's enabled capabilities and hwassist bits to hn(4)'s
 * interface.  'ifr' is unused.  Always returns 0.
 */
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused)
{
	if_t ifp, vf_ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Just sync up with VF's enabled capabilities.
*/
	if_setcapenable(ifp, if_getcapenable(vf_ifp));
	if_sethwassist(ifp, if_gethwassist(vf_ifp));

	return (0);
}

/*
 * Issue SIOCSIFFLAGS on the VF with the flags currently stored in the
 * VF's ifnet.  Returns the ifhwioctl() result.
 */
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	if_t vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
	/* The flags word is split into low and high 16-bit halves. */
	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
}

/*
 * Overwrite the VF's if_flags with hn(4)'s flags, adding IFF_ALLMULTI
 * when hn(4) has any multicast address configured.
 */
static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	if_t ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!if_maddr_empty(ifp))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
}

/*
 * Input hook for packets received on the VF: look up the owning hn(4)
 * interface in hn_vfmap and pass the packet chain to it, or free the
 * chain if no mapping exists.
 */
static void
hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	if_t hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (if_getindex(vf_ifp) < hn_vfmap_size)
		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF an effort in
			 * vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
*/
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

/*
 * Re-derive MTU dependent settings: the TSO size limit, and the LRO
 * length limit if ring 0's limit is below the minimum for the
 * interface.
 */
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	if_t ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
}

/*
 * Translate NDIS_HASH_* type bits into the matching RSS_TYPE_* bits.
 * Bits without a mapping below are dropped.
 */
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
		types |= RSS_TYPE_UDP_IPV4;
	return (types);
}

/*
 * Translate RSS_TYPE_* bits into the matching NDIS_HASH_* type bits.
 * UDP over IPv6 types must not be passed in.
 */
static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	if (types & RSS_TYPE_UDP_IPV4)
		rss_hash |= NDIS_HASH_UDP_IPV4_X;
	return (rss_hash);
}

/* Store 'mbuf_hash' as hn_mbuf_hash in every RX ring. */
static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}

/*
 * Align the synthetic parts' RSS setup with the VF's: query the VF's
 * Toeplitz key and hash types, adopt the key and the common hash types,
 * optionally reconfigure RSS ('reconf'), and restrict which hash types
 * are delivered through mbufs where the two sides conflict.
 *
 * Returns early, changing nothing, if only one RX ring is in use or
 * Toeplitz is not among the synthetic parts' hash capabilities.
 */
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	if_t ifp, vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    if_name(vf_ifp), error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    if_name(vf_ifp), ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    if_name(vf_ifp), ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    if_name(vf_ifp), error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    if_name(vf_ifp), ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed. "
		    "VF %#x, mine %#x\n", if_name(vf_ifp),
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	/*
	 * diff_types: types only one side has.
	 * my_types:   types both sides have.
	 */
	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery.
*/ if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_UDP_IPV6; } if ((my_types & RSS_TYPE_UDP_IPV6_EX) && (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; } /* * Indirect table does not matter. */ sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | hn_rss_type_tondis(my_types); memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (reconf) { error = hn_rss_reconfig(sc); if (error) { /* XXX roll-back? */ if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); } static void hn_vf_rss_restore(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); if (sc->hn_rx_ring_inuse == 1) goto done; /* * Restore hash types. Key does _not_ matter. */ if (sc->hn_rss_hash != sc->hn_rss_hcap) { int error; sc->hn_rss_hash = sc->hn_rss_hcap; error = hn_rss_reconfig(sc); if (error) { if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); } static void hn_xpnt_vf_setready(struct hn_softc *sc) { if_t ifp, vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Mark the VF ready. */ sc->hn_vf_rdytick = 0; /* * Save information for restoration. 
*/ sc->hn_saved_caps = if_getcapabilities(ifp); sc->hn_saved_tsomax = if_gethwtsomax(ifp); sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); sc->hn_saved_capenable = if_getcapenable(ifp); sc->hn_saved_hwassist = if_gethwassist(ifp); /* * Intersect supported/enabled capabilities. * * NOTE: * if_hwassist is not changed here. */ if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); /* * Fix TSO settings. */ if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); /* * Change VF's enabled capabilities. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); ifr.ifr_reqcap = if_getcapenable(ifp); hn_xpnt_vf_iocsetcaps(sc, &ifr); if (if_getmtu(ifp) != ETHERMTU) { int error; /* * Change VF's MTU. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); ifr.ifr_mtu = if_getmtu(ifp); error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); if (error) { if_printf(ifp, "%s SIOCSIFMTU %u failed\n", if_name(vf_ifp), if_getmtu(ifp)); if (if_getmtu(ifp) > ETHERMTU) { if_printf(ifp, "change MTU to %d\n", ETHERMTU); /* * XXX * No need to adjust the synthetic parts' MTU; * failure of the adjustment will cause us * infinite headache. */ if_setmtu(ifp, ETHERMTU); hn_mtu_change_fixup(sc); } } } } static bool hn_xpnt_vf_isready(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) return (false); if (sc->hn_vf_rdytick == 0) return (true); if (sc->hn_vf_rdytick > ticks) return (false); /* Mark VF as ready. 
*/ hn_xpnt_vf_setready(sc); return (true); } static void hn_xpnt_vf_setenable(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; if (clear_vf) sc->hn_vf_ifp = NULL; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_init(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); if (bootverbose) { if_printf(sc->hn_ifp, "try bringing up %s\n", if_name(sc->hn_vf_ifp)); } /* * Bring the VF up. */ hn_xpnt_vf_saveifflags(sc); if_setflagbits(sc->hn_ifp, IFF_UP, 0); error = hn_xpnt_vf_iocsetflags(sc); if (error) { if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", if_name(sc->hn_vf_ifp), error); return; } /* * NOTE: * Datapath setting must happen _after_ bringing the VF up. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); /* * NOTE: * Fixup RSS related bits _after_ the VF is brought up, since * many VFs generate RSS key during it's initialization. */ hn_vf_rss_fixup(sc, true); /* Mark transparent mode VF as enabled. */ hn_xpnt_vf_setenable(sc); } static void hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) goto done; if (sc->hn_vf_ifp == NULL) goto done; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) goto done; if (sc->hn_vf_rdytick != 0) { /* Mark VF as ready. 
*/ hn_xpnt_vf_setready(sc); } if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) { /* * Delayed VF initialization. */ if (bootverbose) { if_printf(sc->hn_ifp, "delayed initialize %s\n", if_name(sc->hn_vf_ifp)); } hn_xpnt_vf_init(sc); } done: HN_UNLOCK(sc); } static void hn_ifnet_attevent(void *xsc, if_t ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (sc->hn_vf_ifp != NULL) { if_printf(sc->hn_ifp, "%s was attached as VF\n", if_name(sc->hn_vf_ifp)); goto done; } if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) { /* * ifnet.if_start is _not_ supported by transparent * mode VF; mainly due to the IFF_DRV_OACTIVE flag. */ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " "in transparent VF mode.\n", if_name(sc->hn_vf_ifp)); goto done; } rm_wlock(&hn_vfmap_lock); if (if_getindex(ifp) >= hn_vfmap_size) { if_t *newmap; int newsize; newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF; newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF, M_WAITOK | M_ZERO); memcpy(newmap, hn_vfmap, sizeof(if_t) * hn_vfmap_size); free(hn_vfmap, M_DEVBUF); hn_vfmap = newmap; hn_vfmap_size = newsize; } KASSERT(hn_vfmap[if_getindex(ifp)] == NULL, ("%s: ifindex %d was mapped to %s", if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)]))); hn_vfmap[if_getindex(ifp)] = sc->hn_ifp; rm_wunlock(&hn_vfmap_lock); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); sc->hn_vf_ifp = ifp; rm_wunlock(&sc->hn_vf_lock); if (hn_xpnt_vf) { int wait_ticks; /* * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. * Save vf_ifp's current if_input for later restoration. */ sc->hn_vf_input = if_getinputfn(ifp); if_setinputfn(ifp, hn_xpnt_vf_input); /* * Stop link status management; use the VF's. 
*/ hn_suspend_mgmt(sc); /* * Give VF sometime to complete its attach routing. */ wait_ticks = hn_xpnt_vf_attwait * hz; sc->hn_vf_rdytick = ticks + wait_ticks; taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, wait_ticks); } done: HN_UNLOCK(sc); } static void hn_ifnet_detevent(void *xsc, if_t ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (sc->hn_vf_ifp == NULL) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (hn_xpnt_vf) { /* * Make sure that the delayed initialization is not running. * * NOTE: * - This lock _must_ be released, since the hn_vf_init task * will try holding this lock. * - It is safe to release this lock here, since the * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. * * XXX racy, if hn(4) ever detached. */ HN_UNLOCK(sc); taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); HN_LOCK(sc); KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", if_name(sc->hn_ifp))); if_setinputfn(ifp, sc->hn_vf_input); sc->hn_vf_input = NULL; if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); if (sc->hn_vf_rdytick == 0) { /* * The VF was ready; restore some settings. */ if_setcapabilities(ifp, sc->hn_saved_caps); if_sethwtsomax(ifp, sc->hn_saved_tsomax); if_sethwtsomaxsegcount(sc->hn_ifp, sc->hn_saved_tsosegcnt); if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz); if_setcapenable(ifp, sc->hn_saved_capenable); if_sethwassist(ifp, sc->hn_saved_hwassist); } if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { /* * Restore RSS settings. */ hn_vf_rss_restore(sc); /* * Resume link status management, which was suspended * by hn_ifnet_attevent(). */ hn_resume_mgmt(sc); } } /* Mark transparent mode VF as disabled. 
*/ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); rm_wlock(&hn_vfmap_lock); KASSERT(if_getindex(ifp) < hn_vfmap_size, ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); if (hn_vfmap[if_getindex(ifp)] != NULL) { KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, ("%s: ifindex %d was mapped to %s", if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)]))); hn_vfmap[if_getindex(ifp)] = NULL; } rm_wunlock(&hn_vfmap_lock); done: HN_UNLOCK(sc); } static void hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) { struct hn_softc *sc = xsc; if (sc->hn_vf_ifp == ifp) if_link_state_change(sc->hn_ifp, link_state); } static int hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int tsomax; int error; tsomax = if_gethwtsomax(sc->hn_ifp); error = sysctl_handle_int(oidp, &tsomax, 0, req); return error; } static int hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int tsomaxsegcnt; int error; tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); return error; } static int hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int tsomaxsegsz; int error; tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); return error; } static int hn_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } static int hn_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; if_t ifp = NULL; int error, ring_cnt, tx_ring_cnt; uint32_t mtu; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); rm_init(&sc->hn_vf_lock, "hnvf"); if (hn_xpnt_vf && hn_xpnt_vf_accbpf) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; /* * Initialize these 
tunables once. */ sc->hn_agg_size = hn_tx_agg_size; sc->hn_agg_pkts = hn_tx_agg_pkts; /* * Setup taskqueue for transmission. */ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { int i; sc->hn_tx_taskqs = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskqs[i]); taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, "%s tx%d", device_get_nameunit(dev), i); } } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { sc->hn_tx_taskqs = hn_tx_taskque; } /* * Setup taskqueue for mangement tasks, e.g. link status. */ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); if (hn_xpnt_vf) { /* * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. */ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_vf_taskq); taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", device_get_nameunit(dev)); TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, hn_xpnt_vf_init_taskfunc, sc); } /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); if_setsoftc(ifp, sc); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). 
* * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } #ifdef RSS if (ring_cnt > rss_getnumbuckets()) ring_cnt = rss_getnumbuckets(); #endif tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } #endif /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) { error = ENXIO; goto failed; } /* * Install orphan handler for the revocation of this device's * primary channel. * * NOTE: * The processing order is critical here: * Install the orphan handler, _before_ testing whether this * device's primary channel has been revoked or not. */ vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); if (vmbus_chan_is_revoked(sc->hn_prichan)) { error = ENXIO; goto failed; } /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ETHERMTU; else if (bootverbose) device_printf(dev, "RNDIS mtu %u\n", mtu); if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. 
*/ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } /* * Fixup TX/RX stuffs after synthetic parts are attached. */ hn_fixup_tx_data(sc); hn_fixup_rx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, "IU", "max TSO size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, "IU", "max # of TSO segments"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, "IU", "max size of TSO segment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hash_sysctl, "A", "RSS hash"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); #ifndef RSS /* * Don't allow RSS key/indirect table changes, if RSS is defined. 
*/ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); #endif SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, "RNDIS offered packet transmission aggregation size limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, "RNDIS offered packet transmission aggregation count limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, "RNDIS packet transmission aggregation alignment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_size_sysctl, "I", "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pkts_sysctl, "I", "Packet transmission aggregation packets, " "0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_polling_sysctl, "I", "Polling frequency: [100,1000000], 0 disable polling"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_vf_sysctl, "A", "Virtual Function's name"); if (!hn_xpnt_vf) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxvf_sysctl, "A", "activated Virtual Function's name"); } else { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_enabled_sysctl, "I", "Transparent VF enabled"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_accbpf_sysctl, "I", "Accurate BPF for transparent 
VF"); } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", "switch to rsc"); /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ if_setbaudrate(ifp, IF_Gbps(10)); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); if_setioctlfn(ifp, hn_ioctl); if_setinitfn(ifp, hn_init); #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); if_setstartfn(ifp, hn_start); if_setsendqlen(ifp, qdepth); if_setsendqready(ifp); } else #endif { if_settransmitfn(ifp, hn_transmit); if_setqflushfn(ifp, hn_xmit_qflush); } if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. */ if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); } if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); if (sc->hn_caps & HN_CAP_TSO4) { if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); if_sethwassistbits(ifp, CSUM_IP_TSO, 0); } if (sc->hn_caps & HN_CAP_TSO6) { if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); } /* Enable all available capabilities by default. */ if_setcapenable(ifp, if_getcapabilities(ifp)); /* * Disable IPv6 TSO and TXCSUM by default, they still can * be enabled through SIOCSIFCAP. 
*/ if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { /* * Lock hn_set_tso_maxsize() to simplify its * internal logic. */ HN_LOCK(sc); hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); HN_UNLOCK(sc); if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); if_sethwtsomaxsegsize(ifp, PAGE_SIZE); } ether_ifattach(ifp, eaddr); if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); } if (mtu < ETHERMTU) { if_setmtu(ifp, mtu); } /* Inform the upper layer about the long frame support. */ if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); /* * Kick off link status check. */ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_update_link_status(sc); if (!hn_xpnt_vf) { sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); } else { sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); } /* * NOTE: * Subscribe ether_ifattach event, instead of ifnet_arrival event, * since interface's LLADDR is needed; interface LLADDR is not * available when ifnet_arrival event is triggered. 
*/ sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); hn_detach(dev); return (error); } static int hn_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); if_t ifp = sc->hn_ifp, vf_ifp; if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { /* * In case that the vmbus missed the orphan handler * installation. */ vmbus_xact_ctx_orphan(sc->hn_xact); } if (sc->hn_ifaddr_evthand != NULL) EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); if (sc->hn_ifnet_evthand != NULL) EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); if (sc->hn_ifnet_atthand != NULL) { EVENTHANDLER_DEREGISTER(ether_ifattach_event, sc->hn_ifnet_atthand); } if (sc->hn_ifnet_dethand != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, sc->hn_ifnet_dethand); } if (sc->hn_ifnet_lnkhand != NULL) EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); vf_ifp = sc->hn_vf_ifp; __compiler_membar(); if (vf_ifp != NULL) hn_ifnet_detevent(sc, vf_ifp); if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) hn_stop(sc, true); /* * NOTE: * hn_stop() only suspends data, so managment * stuffs have to be suspended manually here. 
*/ hn_suspend_mgmt(sc); hn_synth_detach(sc); } HN_UNLOCK(sc); ether_ifdetach(ifp); } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(sc->hn_tx_taskqs[i]); free(sc->hn_tx_taskqs, M_DEVBUF); } taskqueue_free(sc->hn_mgmt_taskq0); if (sc->hn_vf_taskq != NULL) taskqueue_free(sc->hn_vf_taskq); if (sc->hn_xact != NULL) { /* * Uninstall the orphan handler _before_ the xact is * destructed. */ vmbus_chan_unset_orphan(sc->hn_prichan); vmbus_xact_ctx_destroy(sc->hn_xact); } if_free(ifp); HN_LOCK_DESTROY(sc); rm_destroy(&sc->hn_vf_lock); return (0); } static int hn_shutdown(device_t dev) { return (0); } static void hn_link_status(struct hn_softc *sc) { uint32_t link_status; int error; error = hn_rndis_get_linkstatus(sc, &link_status); if (error) { /* XXX what to do? */ return; } if (link_status == NDIS_MEDIA_STATE_CONNECTED) sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; else sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? LINK_STATE_UP : LINK_STATE_DOWN); } static void hn_link_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) return; hn_link_status(sc); } static void hn_netchg_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Prevent any link status checks from running. */ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; /* * Fake up a [link down --> link up] state change; 5 seconds * delay is used, which closely simulates miibus reaction * upon link down event. 
*/ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 5 * hz); } static void hn_netchg_status_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Re-allow link status checks. */ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; hn_link_status(sc); } static void hn_update_link_status(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); } static void hn_change_network(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("put an onagg txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; if (!STAILQ_EMPTY(&txd->agg_list)) { struct hn_txdesc *tmp_txd; while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { int freed __diagused; 
KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("resursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			/*
			 * Unlink the child from the aggregation list and
			 * release it through the normal put path; the
			 * assertion below expects that put to free it.
			 */
			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	/*
	 * Release whichever transmit resource this txdesc holds: either
	 * a chimney sending buffer or a loaded DMA map.
	 */
	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	/* Free the mbuf chain attached to this txdesc, if any. */
	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	/* Hand the txdesc back to the ring's pool of free descriptors. */
	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	/* 1: the txdesc was actually released by this call. */
	return 1;
}

/*
 * Take a free txdesc from the ring's pool; the result is NULL when no
 * descriptor is available.
 */
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d",
txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING #ifdef HN_DEBUG atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif #endif /* HN_USE_TXDESC_BUFRING */ KASSERT(txd->m == NULL && txd->refs == 0 && STAILQ_EMPTY(&txd->agg_list) && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0 && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static __inline void hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) { KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("recursive aggregation on aggregating txdesc")); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("already aggregated")); KASSERT(STAILQ_EMPTY(&txd->agg_list), ("recursive aggregation on to-be-aggregated txdesc")); txd->flags |= HN_TXD_FLAG_ONAGG; STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; 
KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } static void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) struct epoch_tracker et; NET_EPOCH_ENTER(et); tcp_lro_flush_all(&rxr->hn_lro); NET_EPOCH_EXIT(et); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } static __inline void * hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, size_t pi_dlen, uint32_t pi_type) { const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); struct rndis_pktinfo *pi; KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); /* * Per-packet-info does not move; it only grows. * * NOTE: * rm_pktinfooffset in this phase counts from the beginning * of rndis_packet_msg. 
*/ KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, ("%u pktinfo overflows RNDIS packet msg", pi_type)); pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen); pkt->rm_pktinfolen += pi_size; pi->rm_size = pi_size; pi->rm_type = pi_type; pi->rm_internal = 0; pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; return (pi->rm_data); } static __inline int hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) { struct hn_txdesc *txd; struct mbuf *m; int error, pkts; txd = txr->hn_agg_txd; KASSERT(txd != NULL, ("no aggregate txdesc")); /* * Since hn_txpkt() will reset this temporary stat, save * it now, so that oerrors can be updated properly, if * hn_txpkt() ever fails. */ pkts = txr->hn_stat_pkts; /* * Since txd's mbuf will _not_ be freed upon hn_txpkt() * failure, save it for later freeing, if hn_txpkt() ever * fails. */ m = txd->m; error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m is not. */ m_freem(m); txr->hn_flush_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); } /* Reset all aggregation states. */ txr->hn_agg_txd = NULL; txr->hn_agg_szleft = 0; txr->hn_agg_pktleft = 0; txr->hn_agg_prevpkt = NULL; return (error); } static void * hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, int pktsize) { void *chim; if (txr->hn_agg_txd != NULL) { if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { struct hn_txdesc *agg_txd = txr->hn_agg_txd; struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; int olen; /* * Update the previous RNDIS packet's total length, * it can be increased due to the mandatory alignment * padding for this RNDIS packet. And update the * aggregating txdesc's chimney sending buffer size * accordingly. * * XXX * Zero-out the padding, as required by the RNDIS spec. */ olen = pkt->rm_len; pkt->rm_len = roundup2(olen, txr->hn_agg_align); agg_txd->chim_size += pkt->rm_len - olen; /* Link this txdesc to the parent. 
*/ hn_txdesc_agg(agg_txd, txd); chim = (uint8_t *)pkt + pkt->rm_len; /* Save the current packet for later fixup. */ txr->hn_agg_prevpkt = chim; txr->hn_agg_pktleft--; txr->hn_agg_szleft -= pktsize; if (txr->hn_agg_szleft <= HN_PKTSIZE_MIN(txr->hn_agg_align)) { /* * Probably can't aggregate more packets, * flush this aggregating txdesc proactively. */ txr->hn_agg_pktleft = 0; } /* Done! */ return (chim); } hn_flush_txagg(ifp, txr); } KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) return (NULL); txr->hn_tx_chimney++; chim = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); if (txr->hn_agg_pktmax > 1 && txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { txr->hn_agg_txd = txd; txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; txr->hn_agg_prevpkt = chim; } return (chim); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. 
*/ static int hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; void *chim = NULL; int pkt_hlen, pkt_size; pkt = txd->rndis_pkt; pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); if (pkt_size < txr->hn_chim_size) { chim = hn_try_txagg(ifp, txr, txd, pkt_size); if (chim != NULL) pkt = chim; } else { if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); } pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = m_head->m_pkthdr.len; pkt->rm_dataoffset = 0; pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_oobdataoffset = 0; pkt->rm_oobdatalen = 0; pkt->rm_oobdataelements = 0; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; pkt->rm_vchandle = 0; pkt->rm_reserved = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet. */ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); if (M_HASHTYPE_ISHASH(m_head)) /* * The flowid field contains the hash value host * set in the rx queue if it is a ip forwarding pkt. * Set the same hash value so host can send on the * cpu it was received. */ *pi_data = m_head->m_pkthdr.flowid; else /* * Otherwise just put the tx queue index. 
*/ *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { *pi_data = NDIS_LSO2_INFO_MAKEIPV4( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { *pi_data = NDIS_LSO2_INFO_MAKEIPV6( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) { *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) { *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } } pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Fixup RNDIS packet message total length */ pkt->rm_len += pkt_hlen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Fast path: Chimney sending. 
*/ if (chim != NULL) { struct hn_txdesc *tgt_txd = txd; if (txr->hn_agg_txd != NULL) { tgt_txd = txr->hn_agg_txd; #ifdef INVARIANTS *m_head0 = NULL; #endif } KASSERT(pkt == chim, ("RNDIS pkt not in chimney sending buffer")); KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, ("chimney sending buffer is not used")); tgt_txd->chim_size += pkt->rm_len; m_copydata(m_head, 0, m_head->m_pkthdr.len, ((uint8_t *)chim) + pkt_hlen); txr->hn_gpa_cnt = 0; txr->hn_sendpkt = hn_txpkt_chim; goto done; } KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("chimney buffer is used")); KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (__predict_false(error)) { int freed __diagused; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pkt_hlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_txpkt_sglist; done: txd->m = m_head; /* Set the completion routine */ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); /* Update temporary stats for later use. 
*/ txr->hn_stat_pkts++; txr->hn_stat_size += m_head->m_pkthdr.len; if (m_head->m_flags & M_MCAST) txr->hn_stat_mcasts++; return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0, has_bpf; again: - has_bpf = bpf_peers_present(if_getbpf(ifp)); + has_bpf = bpf_peers_present_if(ifp); if (has_bpf) { /* * Make sure that this txd and any aggregated txds are not * freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); } error = txr->hn_sendpkt(txr, txd); if (!error) { if (has_bpf) { const struct hn_txdesc *tmp_txd; ETHER_BPF_MTAP(ifp, txd->m); STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) ETHER_BPF_MTAP(ifp, tmp_txd->m); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { if_inc_counter(ifp, IFCOUNTER_OBYTES, txr->hn_stat_size); if (txr->hn_stat_mcasts != 0) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, txr->hn_stat_mcasts); } } txr->hn_pkts += txr->hn_stat_pkts; txr->hn_sends++; } if (has_bpf) hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed __diagused; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. 
*/ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } /* Reset temporary stats, after this sending is done. */ txr->hn_stat_size = 0; txr->hn_stat_pkts = 0; txr->hn_stat_mcasts = 0; return (error); } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return the last mbuf in the chain or NULL if failed to * allocate new mbuf. */ static struct mbuf * hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. 
*/ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) return NULL; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } return m; } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } return tcp_lro_rx(lc, m, 0); } #endif static int hn_rxpkt(struct hn_rx_ring *rxr) { if_t ifp, hn_ifp = rxr->hn_ifp; struct mbuf *m_new, *n; int size, do_lro = 0, do_csum = 1, is_vf = 0; int hash_type = M_HASHTYPE_NONE; int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; int i; ifp = hn_ifp; if (rxr->hn_rxvf_ifp != NULL) { /* * Non-transparent mode VF; pretend this packet is from * the VF. */ ifp = rxr->hn_rxvf_ifp; is_vf = 1; } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { /* Transparent mode VF. */ is_vf = 1; } if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { /* * NOTE: * See the NOTE of hn_rndis_init_fixat(). This * function can be reached, immediately after the * RNDIS is initialized but before the ifnet is * setup on the hn_attach() path; drop the unexpected * packets. */ return (0); } if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); return (0); } if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], rxr->rsc.frag_len[0]); m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. 
*/ size = MCLBYTES; if (rxr->rsc.pktlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } n = m_new; for (i = 0; i < rxr->rsc.cnt; i++) { n = hv_m_append(n, rxr->rsc.frag_len[i], rxr->rsc.frag_data[i]); if (n == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } else { m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; } } } if (rxr->rsc.pktlen <= MHLEN) rxr->hn_small_pkts++; m_new->m_pkthdr.rcvif = ifp; if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (rxr->rsc.csum_info != NULL) { /* IP csum offload */ if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } /* * XXX * As of this write (Oct 28th, 2016), host side will turn * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so * the do_lro setting here is actually _not_ accurate. We * depend on the RSS hash type check to reset do_lro. 
*/ if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { hn_rxpkt_proto(m_new, &l3proto, &l4proto); if (l3proto == ETHERTYPE_IP) { if (l4proto == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (l4proto == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (l4proto != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } if (rxr->rsc.vlan_info != NULL) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); m_new->m_flags |= M_VLANTAG; } /* * If VF is activated (tranparent/non-transparent mode does not * matter here). * * - Disable LRO * * hn(4) will only receive broadcast packets, multicast packets, * TCP SYN and SYN|ACK (in Azure), LRO is useless for these * packet types. * * For non-transparent, we definitely _cannot_ enable LRO at * all, since the LRO flush will use hn(4) as the receiving * interface; i.e. hn_ifp->if_input(hn_ifp, m). */ if (is_vf) do_lro = 0; /* * If VF is activated (tranparent/non-transparent mode does not * matter here), do _not_ mess with unsupported hash types or * functions. 
*/ if (rxr->rsc.hash_info != NULL) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); if (!is_vf) hash_type = M_HASHTYPE_OPAQUE_HASH; if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & rxr->hn_mbuf_hash); /* * NOTE: * do_lro is resetted, if the hash types are not TCP * related. See the comment in the above csum_flags * setup section. */ switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; do_lro = 0; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { int def_htype = M_HASHTYPE_OPAQUE_HASH; if (is_vf) def_htype = M_HASHTYPE_NONE; /* * UDP 4-tuple hash is delivered as * TCP 4-tuple hash. */ if (l3proto == ETHERTYPE_MAX) { hn_rxpkt_proto(m_new, &l3proto, &l4proto); } if (l3proto == ETHERTYPE_IP) { if (l4proto == IPPROTO_UDP && (rxr->hn_mbuf_hash & NDIS_HASH_UDP_IPV4_X)) { hash_type = M_HASHTYPE_RSS_UDP_IPV4; do_lro = 0; } else if (l4proto != IPPROTO_TCP) { hash_type = def_htype; do_lro = 0; } } else { hash_type = def_htype; do_lro = 0; } } break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; do_lro = 0; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; do_lro = 0; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else if (!is_vf) { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if (hn_ifp != ifp) { const struct ether_header *eh; /* * Non-transparent mode VF is activated. */ /* * Allow tapping on hn(4). */ ETHER_BPF_MTAP(hn_ifp, m_new); /* * Update hn(4)'s stats. */ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); /* Checked at the beginning of this function. 
*/ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); eh = mtod(m_new, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); } rxr->hn_pkts++; if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } if_input(ifp, m_new); return (0); } static int hn_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data, ifr_vf; if_t vf_ifp; int mask, error = 0; struct ifrsskey *ifrk; struct ifrsshash *ifrh; uint32_t mtu; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu > HN_MTU_MAX) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (if_getmtu(ifp) == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) { vf_ifp = sc->hn_vf_ifp; ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), sizeof(ifr_vf.ifr_name)); error = ifhwioctl(SIOCSIFMTU,vf_ifp, (caddr_t)&ifr_vf, curthread); if (error) { HN_UNLOCK(sc); if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", if_name(vf_ifp), ifr->ifr_mtu, error); break; } } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. */ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ifr->ifr_mtu; else if (bootverbose) if_printf(ifp, "RNDIS mtu %u\n", mtu); /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. 
*/ if (mtu >= ifr->ifr_mtu) { mtu = ifr->ifr_mtu; } else { if_printf(ifp, "fixup mtu %d -> %u\n", ifr->ifr_mtu, mtu); } if_setmtu(ifp, mtu); /* * Synthetic parts' reattach may change the chimney * sending size; update it. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ hn_mtu_change_fixup(sc); /* * All done! Resume the interface now. */ hn_resume(sc); if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* * Since we have reattached the NVS part, * change the datapath to VF again; in case * that it is lost, after the NVS was detached. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); } HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) hn_xpnt_vf_saveifflags(sc); if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { /* * Caller meight hold mutex, e.g. * bpf; use busy-wait for the RNDIS * reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) error = hn_xpnt_vf_iocsetflags(sc); } else { hn_init_locked(sc); } } else { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) hn_stop(sc, false); } sc->hn_if_flags = if_getflags(ifp); HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), sizeof(ifr_vf.ifr_name)); error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); HN_UNLOCK(sc); break; } /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. 
*/ mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ if_getcapenable(ifp); if (mask & IFCAP_TXCSUM) { if_togglecapenable(ifp, IFCAP_TXCSUM); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); else if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); } if (mask & IFCAP_TXCSUM_IPV6) { if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); else if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) if_togglecapenable(ifp, IFCAP_RXCSUM); #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if (mask & IFCAP_RXCSUM_IPV6) if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); #endif if (mask & IFCAP_LRO) if_togglecapenable(ifp, IFCAP_LRO); if (mask & IFCAP_TSO4) { if_togglecapenable(ifp, IFCAP_TSO4); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); else if_sethwassistbits(ifp, 0, CSUM_IP_TSO); } if (mask & IFCAP_TSO6) { if_togglecapenable(ifp, IFCAP_TSO6); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); else if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { /* * Multicast uses mutex; use busy-wait for * the RNDIS reply. 
*/ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); } /* XXX vlan(4) style mcast addr maintenance */ if (hn_xpnt_vf_isready(sc)) { int old_if_flags; old_if_flags = if_getflags(sc->hn_vf_ifp); hn_xpnt_vf_saveifflags(sc); if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & IFF_ALLMULTI)) error = hn_xpnt_vf_iocsetflags(sc); } HN_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { /* * SIOCGIFMEDIA expects ifmediareq, so don't * create and pass ifr_vf to the VF here; just * replace the ifr_name. */ vf_ifp = sc->hn_vf_ifp; strlcpy(ifr->ifr_name, if_name(vf_ifp), sizeof(ifr->ifr_name)); error = ifhwioctl(cmd, vf_ifp, data, curthread); /* Restore the ifr_name. */ strlcpy(ifr->ifr_name, if_name(ifp), sizeof(ifr->ifr_name)); HN_UNLOCK(sc); break; } HN_UNLOCK(sc); error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; case SIOCGIFRSSHASH: ifrh = (struct ifrsshash *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrh->ifrh_func = RSS_FUNC_NONE; ifrh->ifrh_types = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; else ifrh->ifrh_func = RSS_FUNC_PRIVATE; ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); HN_UNLOCK(sc); break; case SIOCGIFRSSKEY: ifrk = (struct ifrsskey *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrk->ifrk_func = RSS_FUNC_NONE; ifrk->ifrk_keylen = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; else ifrk->ifrk_func = RSS_FUNC_PRIVATE; ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, NDIS_HASH_KEYSIZE_TOEPLITZ); HN_UNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc, bool detaching) { if_t ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 
("synthetic parts were not attached")); /* Clear RUNNING bit ASAP. */ if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); /* Disable polling. */ hn_polling(sc, 0); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { KASSERT(sc->hn_vf_ifp != NULL, ("%s: VF is not attached", if_name(ifp))); /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); /* * NOTE: * Datapath setting must happen _before_ bringing * the VF down. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); /* * Bring the VF down. */ hn_xpnt_vf_saveifflags(sc); if_setflagbits(ifp, 0, IFF_UP); hn_xpnt_vf_iocsetflags(sc); } /* Suspend data transfers. */ hn_suspend_data(sc); /* Clear OACTIVE bit. */ if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* * If the non-transparent mode VF is active, make sure * that the RX filter still allows packet reception. */ if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) hn_rxfilter_config(sc); } static void hn_init_locked(struct hn_softc *sc) { if_t ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_rxfilter_config(sc); /* Clear OACTIVE bit. */ if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. */ hn_resume_tx(sc, sc->hn_tx_ring_inuse); if (hn_xpnt_vf_isready(sc)) { /* Initialize transparent VF. */ hn_xpnt_vf_init(sc); } /* Everything is ready; unleash! */ if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); /* Re-enable polling if requested. 
*/ if (sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. 
*/ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. 
*/ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, size; size = sc->hn_agg_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_size = size; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, pkts; pkts = sc->hn_agg_pkts; error = sysctl_handle_int(oidp, &pkts, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_pkts = pkts; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pkts; pkts = sc->hn_tx_ring[0].hn_agg_pktmax; return (sysctl_handle_int(oidp, &pkts, 0, req)); } static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int align; 
align = sc->hn_tx_ring[0].hn_agg_align; return (sysctl_handle_int(oidp, &align, 0, req)); } static void hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) { if (pollhz == 0) vmbus_chan_poll_disable(chan); else vmbus_chan_poll_enable(chan, pollhz); } static void hn_polling(struct hn_softc *sc, u_int pollhz) { int nsubch = sc->hn_rx_ring_inuse - 1; HN_LOCK_ASSERT(sc); if (nsubch > 0) { struct vmbus_channel **subch; int i; subch = vmbus_subchan_get(sc->hn_prichan, nsubch); for (i = 0; i < nsubch; ++i) hn_chan_polling(subch[i], pollhz); vmbus_subchan_rel(subch, nsubch); } hn_chan_polling(sc->hn_prichan, pollhz); } static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pollhz, error; pollhz = sc->hn_pollhz; error = sysctl_handle_int(oidp, &pollhz, 0, req); if (error || req->newptr == NULL) return (error); if (pollhz != 0 && (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) return (EINVAL); HN_LOCK(sc); if (sc->hn_pollhz != pollhz) { sc->hn_pollhz = pollhz; if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) hn_polling(sc, sc->hn_pollhz); } HN_UNLOCK(sc); return (0); } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = if_gethwassist(sc->hn_ifp); HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", 
hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; uint32_t mtu; int error; HN_LOCK(sc); error = hn_rndis_get_mtu(sc, &mtu); if (error) { if_printf(sc->hn_ifp, "failed to get mtu\n"); goto back; } error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); if (error || req->newptr == NULL) goto back; error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); if (error) goto back; error = hn_rndis_reconf_offload(sc, mtu); back: HN_UNLOCK(sc); return (error); } #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; if ((sc->hn_flags & HN_FLAG_RXVF) || (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { /* * RSS key is synchronized w/ VF's, don't allow users * to change it. */ error = EBUSY; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. 
*/ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } #endif /* !RSS */ static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hcap; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rx_ring[0].hn_mbuf_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; if_t vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_vf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; if_t vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; 
struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct epoch_tracker et; if_t ifp; if (hn_vfmap[i] == NULL) continue; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) sbuf_printf(sb, "%s", if_name(ifp)); else sbuf_printf(sb, " %s", if_name(ifp)); first = false; } NET_EPOCH_EXIT(et); } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct epoch_tracker et; if_t ifp, hn_ifp; hn_ifp = hn_vfmap[i]; if (hn_ifp == NULL) continue; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) { sbuf_printf(sb, "%s:%s", if_name(ifp), if_name(hn_ifp)); } else { sbuf_printf(sb, " %s:%s", if_name(ifp), if_name(hn_ifp)); } first = false; } NET_EPOCH_EXIT(et); } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, onoff = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) onoff = 1; error = sysctl_handle_int(oidp, &onoff, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); /* NOTE: hn_vf_lock for hn_transmit() */ rm_wlock(&sc->hn_vf_lock); if (onoff) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; else sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; rm_wunlock(&sc->hn_vf_lock); HN_UNLOCK(sc); return (0); } static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = 
arg1; int enabled = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) enabled = 1; return (sysctl_handle_int(oidp, &enabled, 0, req)); } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. 
*/ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static void hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); eh = mtod(m_new, const struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) return; evl = mtod(m_new, const struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } *l3proto = etype; if (etype == ETHERTYPE_IP) *l4proto = hn_check_iplen(m_new, hoff); else *l4proto = IPPROTO_DONE; } static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) int lroent_cnt; #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. 
*/ sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_mbuf_hash = NDIS_HASH_ALL; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. 
*/ #if defined(INET) || defined(INET6) tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rsc_pkts", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_rsc_pkts, "# of RSC packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rsc_drop", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_rsc_drop, "# of RSC fragments dropped"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "pktbuf_len", CTLFLAG_RD, &rxr->hn_pktbuf_len, 0, "Temporary channel packet buffer length"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), hn_rx_stat_u64_sysctl, "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), hn_rx_stat_u64_sysctl, "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, 
"LU", "# of LRO tries"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segment verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | 
CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_ack_failed), hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); return (0); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rxbuf != NULL) { if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF); else device_printf(sc->hn_dev, "RXBUF is referenced\n"); sc->hn_rxbuf = NULL; } if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_br == NULL) continue; if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF); } else { device_printf(sc->hn_dev, "%dth channel bufring is referenced", i); } rxr->hn_br = NULL; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif free(rxr->hn_pktbuf, M_DEVBUF); } free(sc->hn_rx_ring, M_DEVBUF); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_tx_ring_create(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING 
SLIST_INIT(&txr->hn_txlist); #else txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); #endif if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); } else { txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; } #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else #endif { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_PKT_ALIGN, /* alignment */ HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. 
*/ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; STAILQ_INIT(&txd->agg_list); /* * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_pkt, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &txd->rndis_pkt_dmap); if (error) { device_printf(dev, "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap, txd->rndis_pkt, HN_RNDIS_PKT_LEN, hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* DMA map for TX data. 
*/ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); #ifdef HN_DEBUG SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); #endif #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, "# of packets transmitted"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, "# of sends"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } 
static void hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->refs == 0 || txd->refs == 1, ("invalid txd refs %d", txd->refs)); /* Aggregated txds will be freed by their aggregating txd. */ if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { int freed __diagused; freed = hn_txdesc_put(txr, txd); KASSERT(freed, ("can't free txdesc")); } } static void hn_tx_ring_destroy(struct hn_tx_ring *txr) { int i; if (txr->hn_txdesc == NULL) return; /* * NOTE: * Because the freeing of aggregated txds will be deferred * to the aggregating txd, two passes are used here: * - The first pass GCes any pending txds. This GC is necessary, * since if the channels are revoked, hypervisor will not * deliver send-done for all pending txds. * - The second pass frees the busdma stuffs, i.e. after all txds * were freed. */ for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_gc(txr, &txr->hn_txdesc[i]); for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); #endif free(txr->hn_txdesc, M_DEVBUF); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; /* * Create TXBUF for chimney sending. * * NOTE: It is shared by all channels. 
*/ sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (sc->hn_chim == NULL) { device_printf(sc->hn_dev, "allocate txbuf failed\n"); return (ENOMEM); } sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_tx_ring_create(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_flush_failed), hn_tx_stat_ulong_sysctl, "LU", "# of packet transmission aggregation flush failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | 
CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_chim_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, "Applied packet transmission aggregation size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pktmax_sysctl, "I", "Applied packet transmission aggregation packets"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_align_sysctl, "I", "Applied packet transmission aggregation alignment"); return 0; } 
static void hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } static void hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) { if_t ifp = sc->hn_ifp; u_int hw_tsomax; int tso_minlen; HN_LOCK_ASSERT(sc); if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) return; KASSERT(sc->hn_ndis_tso_sgmin >= 2, ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); tso_minlen = sc->hn_ndis_tso_sgmin * mtu; KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && sc->hn_ndis_tso_szmax <= IP_MAXPACKET, ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); if (tso_maxlen < tso_minlen) tso_maxlen = tso_minlen; else if (tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; if (tso_maxlen > sc->hn_ndis_tso_szmax) tso_maxlen = sc->hn_ndis_tso_szmax; hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); if (hn_xpnt_vf_isready(sc)) { if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); } if_sethwtsomax(ifp, hw_tsomax); if (bootverbose) if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); } static void hn_fixup_tx_data(struct hn_softc *sc) { uint64_t csum_assist; int i; hn_set_chim_size(sc, sc->hn_chim_szmax); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_chim_szmax) hn_set_chim_size(sc, hn_tx_chimney_size); csum_assist = 0; if (sc->hn_caps & HN_CAP_IPCS) csum_assist |= CSUM_IP; if (sc->hn_caps & HN_CAP_TCP4CS) csum_assist |= CSUM_IP_TCP; if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) csum_assist |= CSUM_IP_UDP; if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) csum_assist |= CSUM_IP6_UDP; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; if (sc->hn_caps & HN_CAP_HASHVAL) { /* * Support HASHVAL pktinfo on TX path. 
*/ if (bootverbose) if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; } } static void hn_fixup_rx_data(struct hn_softc *sc) { if (sc->hn_caps & HN_CAP_UDPHASH) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; } } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_chim != NULL) { if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF); } else { device_printf(sc->hn_dev, "chimney sending buffer is referenced"); } sc->hn_chim = NULL; } if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_destroy(&sc->hn_tx_ring[i]); free(sc->hn_tx_ring, M_DEVBUF); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } #ifdef HN_IFSTART_SUPPORT static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; if_t ifp = sc->hn_ifp; int sched = 0; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (0); while (!if_sendq_empty(ifp)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; m_head = if_dequeue(ifp); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. 
*/ if_sendq_prepend(ifp, m_head); sched = 1; break; } #if defined(INET6) || defined(INET) if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { m_head = hn_tso_fixup(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { m_head = hn_set_hlen(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } #endif txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; if_sendq_prepend(ifp, m_head); if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ if_sendq_prepend(ifp, m_head); if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif } /* Flush pending aggerated transmission. 
*/ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static void hn_start(if_t ifp) { struct hn_softc *sc = if_getsoftc(ifp); struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; if_t ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. 
*/
		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

#endif	/* HN_IFSTART_SUPPORT */

/*
 * Drain the ring's buf_ring (hn_mbuf_br) and transmit the packets.
 * The caller must hold txr->hn_tx_lock.
 *
 * When len > 0, a packet longer than len is put back and 1 is returned
 * so that the caller can dispatch the work to the TX taskqueue;
 * otherwise 0 is returned.  txr->hn_oactive is set when no TX
 * descriptor is available or when sending fails.
 */
static int
hn_xmit(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	if_t ifp = sc->hn_ifp;
	struct mbuf *m_head;
	int sched = 0;

	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
	KASSERT(hn_use_if_start == 0,
	    ("hn_xmit is called, when if_start is enabled"));
#endif
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
		return (0);

	/* Peek first; the entry is only consumed by drbr_advance(). */
	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
		struct hn_txdesc *txd;
		int error;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx taskqueue.
			 */
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			sched = 1;
			break;
		}

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			txr->hn_no_txdescs++;
			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
			txr->hn_oactive = 1;
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed; discard */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			drbr_advance(ifp, txr->hn_mbuf_br);
			continue;
		}

		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					txr->hn_oactive = 1;
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					drbr_putback(ifp, txr->hn_mbuf_br,
					    m_head);
					txr->hn_oactive = 1;
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif

		/* Sent */
		drbr_advance(ifp, txr->hn_mbuf_br);
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}

/*
 * if_transmit entry point.
 *
 * When HN_XVFFLAG_ENABLED is set, the packet is handed to the VF
 * interface under the hn_vf_lock read lock, BPF is tapped on this
 * interface, and this interface's output counters are updated from the
 * VF's return value.
 *
 * Otherwise the packet headers are fixed up, a TX ring is chosen from
 * the mbuf's flow id, and the packet is enqueued on that ring's
 * buf_ring.  It is then sent directly if the ring's TX lock is free
 * and direct transmission is permitted, or via the TX task.
 *
 * Returns 0 on success, or an errno value (EIO for a header fixup
 * failure, or whatever drbr_enqueue()/if_transmit() returned).
 */
static int
hn_transmit(if_t ifp, struct mbuf *m)
{
	struct hn_softc *sc = if_getsoftc(ifp);
	struct hn_tx_ring *txr;
	int error, idx = 0;

	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
		struct rm_priotracker pt;

		rm_rlock(&sc->hn_vf_lock, &pt);
		/* Re-check the flag now that the read lock is held. */
		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
			struct mbuf *m_bpf = NULL;
			int obytes, omcast;

			/* Sample before if_transmit(); m is handed off. */
			obytes = m->m_pkthdr.len;
			omcast = (m->m_flags & M_MCAST) != 0;

			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
-				if (bpf_peers_present(if_getbpf(ifp))) {
+				if (bpf_peers_present_if(ifp)) {
					m_bpf = m_copypacket(m, M_NOWAIT);
					if (m_bpf == NULL) {
						/*
						 * Failed to grab a shallow
						 * copy; tap now.
						 */
						ETHER_BPF_MTAP(ifp, m);
					}
				}
			} else {
				ETHER_BPF_MTAP(ifp, m);
			}

			error = if_transmit(sc->hn_vf_ifp, m);
			rm_runlock(&sc->hn_vf_lock, &pt);

			if (m_bpf != NULL) {
				/* Tap the copy only if the VF accepted it. */
				if (!error)
					ETHER_BPF_MTAP(ifp, m_bpf);
				m_freem(m_bpf);
			}

			if (error == ENOBUFS) {
				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
			} else if (error) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			} else {
				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
				if (omcast) {
					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
					    omcast);
				}
			}
			return (error);
		}
		rm_runlock(&sc->hn_vf_lock, &pt);
	}

#if defined(INET6) || defined(INET)
	/*
	 * Perform TSO packet header fixup or get l2/l3 header length now,
	 * since packet headers should be cache-hot.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
		m = hn_tso_fixup(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	} else if (m->m_pkthdr.csum_flags &
	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
		m = hn_set_hlen(m);
		if (__predict_false(m == NULL)) {
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return EIO;
		}
	}
#endif

	/*
	 * Select the TX ring based on flowid
	 */
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
#ifdef RSS
		uint32_t bid;

		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
		    &bid) == 0)
			idx = bid % sc->hn_tx_ring_inuse;
		else
#endif
		{
#if defined(INET6) || defined(INET)
			int tcpsyn = 0;

			if (m->m_pkthdr.len < 128 &&
			    (m->m_pkthdr.csum_flags &
			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
				m = hn_check_tcpsyn(m, &tcpsyn);
				if (__predict_false(m == NULL)) {
					if_inc_counter(ifp,
					    IFCOUNTER_OERRORS, 1);
					return (EIO);
				}
			}
#else
			const int tcpsyn = 0;
#endif
			/* Packets flagged as TCP SYN always use ring 0. */
			if (tcpsyn)
				idx = 0;
			else
				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
		}
	}
	txr = &sc->hn_tx_ring[idx];

	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
	if (error) {
		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
		return error;
	}

	if (txr->hn_oactive)
		return 0;

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return 0;
	}
do_sched:
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
	return 0;
}

/*
 * Free every mbuf still queued on the ring's buf_ring, under the
 * ring's TX lock.
 */
static void
hn_tx_ring_qflush(struct hn_tx_ring *txr)
{
	struct mbuf *m;

	mtx_lock(&txr->hn_tx_lock);
	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
		m_freem(m);
	mtx_unlock(&txr->hn_tx_lock);
}

static void
hn_xmit_qflush(if_t ifp)
{
	struct hn_softc *sc = if_getsoftc(ifp);
	struct rm_priotracker pt;
	int i;

	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	if_qflush(ifp);

	rm_rlock(&sc->hn_vf_lock, &pt);
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		if_qflush(sc->hn_vf_ifp);
rm_runlock(&sc->hn_vf_lock, &pt); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; struct hn_tx_ring *txr = NULL; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; rxr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } /* Bind this channel to a proper CPU. 
*/ vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); /* * Open this channel */ cbr.cbr = rxr->hn_br; cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br); cbr.cbr_txsz = HN_TXBR_SIZE; cbr.cbr_rxsz = HN_RXBR_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { if (error == EISCONN) { if_printf(sc->hn_ifp, "bufring is connected after " "chan%u open failure\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else { if_printf(sc->hn_ifp, "open chan%u failed: %d\n", vmbus_chan_id(chan), error); } } return (error); } static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), ("RX ring %d is not attached", idx)); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), ("TX ring %d is not attached attached", idx)); txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } /* * Close this channel. * * NOTE: * Channel closing does _not_ destroy the target channel. */ error = vmbus_chan_close_direct(chan); if (error == EISCONN) { if_printf(sc->hn_ifp, "chan%u bufring is connected " "after being closed\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else if (error) { if_printf(sc->hn_ifp, "chan%u close failed: %d\n", vmbus_chan_id(chan), error); } } static int hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; KASSERT(subchan_cnt > 0, ("no sub-channels")); /* Attach the sub-channels. 
*/
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			/* Remember the most recent failure. */
			error = error1;
			/* Move on; all channels will be detached later. */
		}
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
	} else {
		if (bootverbose) {
			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
			    subchan_cnt);
		}
	}
	return (error);
}

/*
 * Detach every channel in use: first the sub-channels (there are
 * hn_rx_ring_inuse - 1 of them), then the primary channel, and finally
 * wait for the sub-channels to be drained.  With INVARIANTS, verify
 * that no RX or TX ring is still flagged as attached.
 */
static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

#ifdef INVARIANTS
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
#endif
}

static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (nchan == 1) {
		/*
		 * Multiple RX/TX rings are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
	 * table entries.
	 */
	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
	if (error) {
		/* No RSS; this is benign.
*/
		*nsubch = 0;
		return (0);
	}
	if (bootverbose) {
		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
		    rxr_cnt, nchan);
	}

	if (nchan > rxr_cnt)
		nchan = rxr_cnt;
	if (nchan == 1) {
		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
		*nsubch = 0;
		return (0);
	}

	/*
	 * Allocate sub-channels from NVS.
	 */
	*nsubch = nchan - 1;
	error = hn_nvs_alloc_subchans(sc, nsubch);
	if (error || *nsubch == 0) {
		/* Failed to allocate sub-channels. */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Wait for all sub-channels to become ready before moving on.
	 */
	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
	vmbus_subchan_rel(subchans, *nsubch);
	return (0);
}

/*
 * Report whether the synthetic parts may be (re)attached: false when
 * HN_FLAG_ERRORS is set on the softc or when any RX ring carries
 * HN_RX_FLAG_BR_REF, true otherwise.
 */
static bool
hn_synth_attachable(const struct hn_softc *sc)
{
	int i;

	if (sc->hn_flags & HN_FLAG_ERRORS)
		return (false);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
			return (false);
	}
	return (true);
}

/*
 * Make sure that the RX filter is zero after the successful
 * RNDIS initialization.
 *
 * NOTE:
 * Under certain conditions on certain versions of Hyper-V,
 * the RNDIS rxfilter is _not_ zero on the hypervisor side
 * after the successful RNDIS initialization, which breaks
 * the assumption of any following code (well, it breaks the
 * RNDIS API contract actually).  Clear the RNDIS rxfilter
 * explicitly, drain packets sneaking through, and drain the
 * interrupt taskqueues scheduled due to the stealth packets.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}

/*
 * Attach the synthetic parts in order: primary channel, NVS, RNDIS,
 * then the sub-channels and the RSS configuration when more than one
 * channel is usable.
 *
 * Returns 0 on success.  On failure, whatever was attached so far is
 * torn down again: a full hn_synth_detach() once
 * HN_FLAG_SYNTH_ATTACHED has been set, otherwise a piecewise rollback
 * driven by the local `attached' bitmask, after which the previous
 * capabilities are restored.
 */
static int
hn_synth_attach(struct hn_softc *sc, int mtu)
{
/* Bits of the local `attached' mask, consulted on the failure path. */
#define ATTACHED_NVS		0x0002
#define ATTACHED_RNDIS		0x0004

	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int error, nsubch, nchan = 1, i, rndis_inited;
	uint32_t old_caps, attached = 0;

	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
	    ("synthetic parts were attached"));

	if (!hn_synth_attachable(sc))
		return (ENXIO);

	/* Save capabilities for later verification. */
	old_caps = sc->hn_caps;
	sc->hn_caps = 0;

	/* Clear RSS stuffs. */
	sc->hn_rss_ind_size = 0;
	sc->hn_rss_hash = 0;
	sc->hn_rss_hcap = 0;

	/*
	 * Attach the primary channel _before_ attaching NVS and RNDIS.
	 */
	error = hn_chan_attach(sc, sc->hn_prichan);
	if (error)
		goto failed;

	/*
	 * Attach NVS.
	 */
	error = hn_nvs_attach(sc, mtu);
	if (error)
		goto failed;
	attached |= ATTACHED_NVS;

	/*
	 * Attach RNDIS _after_ NVS is attached.
	 */
	error = hn_rndis_attach(sc, mtu, &rndis_inited);
	/* Record RNDIS init even on error so the rollback can undo it. */
	if (rndis_inited)
		attached |= ATTACHED_RNDIS;
	if (error)
		goto failed;

	/*
	 * Make sure capabilities are not changed.
	 */
	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
		    old_caps, sc->hn_caps);
		error = ENXIO;
		goto failed;
	}

	/*
	 * Allocate sub-channels for multi-TX/RX rings.
	 *
	 * NOTE:
	 * The # of RX rings that can be used is equivalent to the # of
	 * channels to be requested.
	 */
	nsubch = sc->hn_rx_ring_cnt - 1;
	error = hn_synth_alloc_subchans(sc, &nsubch);
	if (error)
		goto failed;
	/* NOTE: _Full_ synthetic parts detach is required now. */
	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;

	/*
	 * Set the # of TX/RX rings that could be used according to
	 * the # of channels that NVS offered.
	 */
	nchan = nsubch + 1;
	hn_set_ring_inuse(sc, nchan);
	if (nchan == 1) {
		/* Only the primary channel can be used; done */
		goto back;
	}

	/*
	 * Attach the sub-channels.
	 *
	 * NOTE: hn_set_ring_inuse() _must_ have been called.
	 */
	error = hn_attach_subchans(sc);
	if (error)
		goto failed;

	/*
	 * Configure RSS key and indirect table _after_ all sub-channels
	 * are attached.
	 */
	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
		/*
		 * RSS key is not set yet; set it to the default RSS key.
		 */
		if (bootverbose)
			if_printf(sc->hn_ifp, "setup default RSS key\n");
#ifdef RSS
		rss_getkey(rss->rss_key);
#else
		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
#endif
		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	}

	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
		/*
		 * RSS indirect table is not set yet; set it up in round-
		 * robin fashion.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
			uint32_t subidx;

#ifdef RSS
			subidx = rss_get_indirection_to_bucket(i);
#else
			subidx = i;
#endif
			rss->rss_ind[i] = subidx % nchan;
		}
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * # of usable channels may be changed, so we have to
		 * make sure that all entries in RSS indirect table
		 * are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	sc->hn_rss_hash = sc->hn_rss_hcap;
	if ((sc->hn_flags & HN_FLAG_RXVF) ||
	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
		/* NOTE: Don't reconfigure RSS; will do immediately. */
		hn_vf_rss_fixup(sc, false);
	}
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fixup transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	hn_rndis_init_fixat(sc, nchan);
	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_rndis_init_fixat(sc, nchan);
		hn_synth_detach(sc);
	} else {
		/* Roll back in reverse order of attachment. */
		if (attached & ATTACHED_RNDIS) {
			hn_rndis_init_fixat(sc, nchan);
			hn_rndis_detach(sc);
		}
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}

/*
 * NOTE:
 * The interface must have been suspended through hn_suspend(), before
 * this function gets called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{

	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("synthetic parts were not attached"));

	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS.
*/
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);

	if (vmbus_current_version >= VMBUS_VERSION_WIN10 &&
	    sc->hn_rxbuf_gpadl != 0) {
		/*
		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
		 */
		int error;

		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
		    sc->hn_rxbuf_gpadl);
		if (error) {
			if_printf(sc->hn_ifp,
			    "rxbuf gpadl disconn failed: %d\n", error);
			sc->hn_flags |= HN_FLAG_RXBUF_REF;
		}
		sc->hn_rxbuf_gpadl = 0;
	}

	if (vmbus_current_version >= VMBUS_VERSION_WIN10 &&
	    sc->hn_chim_gpadl != 0) {
		/*
		 * Host is post-Win2016, disconnect chimney sending buffer from
		 * primary channel here.
		 */
		int error;

		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
		    sc->hn_chim_gpadl);
		if (error) {
			if_printf(sc->hn_ifp,
			    "chim gpadl disconn failed: %d\n", error);
			sc->hn_flags |= HN_FLAG_CHIM_REF;
		}
		sc->hn_chim_gpadl = 0;
	}
	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
}

/*
 * Record how many rings are in use.  The RX count becomes ring_cnt;
 * the TX count becomes the smaller of ring_cnt and the number of
 * created TX rings.
 */
static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{

	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

#ifdef RSS
	/* Informational only; the mismatch is reported, not corrected. */
	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
		    rss_getnumbuckets());
	}
#endif

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}

/*
 * Sleep one tick at a time until the channel's RX bufring is empty
 * and, unless the primary channel has been revoked, its TX bufring is
 * empty as well; then drain the channel's interrupt processing.
 */
static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}

/*
 * Force the RNDIS RX filter to NDIS_PACKET_TYPE_NONE and then sleep
 * for (200 * hz) / 1000 ticks.  A failure to set the filter is
 * deliberately ignored.
 */
static void
hn_disable_rx(struct hn_softc *sc)
{

	/*
	 * Disable RX by clearing RX filter forcefully.
	 */
	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);
}

/*
 * NOTE:
 * RX/TX _must_ have been suspended/disabled, before this function
 * is called.
 *
 * Drains the first nchan - 1 sub-channels, then the primary channel.
 */
static void
hn_drain_rxtx(struct hn_softc *sc, int nchan)
{
	struct vmbus_channel **subch = NULL;
	int nsubch;

	/*
	 * Drain RX/TX bufrings and interrupts.
	 */
	nsubch = nchan - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		int i;

		for (i = 0; i < nsubch; ++i)
			hn_chan_drain(sc, subch[i]);
	}
	hn_chan_drain(sc, sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);
}

static void
hn_suspend_data(struct hn_softc *sc)
{
	struct hn_tx_ring *txr;
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able send more packets now. */

		/*
		 * Wait for all pending sends to finish.
		 *
		 * NOTE:
		 * We will _not_ receive all pending send-done, if the
		 * primary channel is revoked.
		 */
		while (hn_tx_ring_pending(txr) &&
		    !vmbus_chan_is_revoked(sc->hn_prichan))
			pause("hnwtx", 1 /* 1 tick */);
	}

	/*
	 * Disable RX.
	 */
	hn_disable_rx(sc);

	/*
	 * Drain RX/TX.
	 */
	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);

	/*
	 * Drain any pending TX tasks.
	 *
	 * NOTE:
	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
*/
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

/*
 * Task body used by hn_suspend_mgmt(): clear hn_mgmt_taskq on the
 * softc passed as the task argument.
 */
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

/*
 * Stop management work: clear hn_mgmt_taskq by running a task through
 * the primary channel, then drain the pending management tasks on
 * hn_mgmt_taskq0.  The softc lock must be held.
 */
static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}

/*
 * Suspend the device: turn polling off, suspend the data path when
 * the interface is running or HN_FLAG_RXVF is set, and always suspend
 * management work.
 */
static void
hn_suspend(struct hn_softc *sc)
{

	/* Disable polling. */
	hn_polling(sc, 0);

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device is receiving packets, so the data path of the
	 * synthetic device must be suspended.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}

/*
 * Clear the suspended mark on the first tx_ring_cnt TX rings, taking
 * each ring's TX lock in turn.
 */
static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

/*
 * Resume the data path: reconfigure the RX filter, un-suspend all
 * created TX rings, flush the buf_rings of TX rings that are no longer
 * in use, and enqueue the txeof task on every ring in use.  The softc
 * lock must be held.
 */
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

/*
 * Restore hn_mgmt_taskq from hn_mgmt_taskq0 and restart either network
 * change handling or link status checking.
 */
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which is more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device has to receive packets, so the data path of the
	 * synthetic device must be resumed.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_resume_data(sc);

	/*
	 * Don't resume link status change if VF is attached/activated.
	 * - In the non-transparent VF mode, the synthetic device marks
	 *   link down until the VF is deactivated; i.e. VF is down.
	 * - In transparent VF mode, VF's media status is used until
	 *   the VF is detached.
	 */
	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
		hn_resume_mgmt(sc);

	/*
	 * Re-enable polling if this interface is running and
	 * the polling is requested.
*/ if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_status_msg *msg; int ofs; if (dlen < sizeof(*msg)) { if_printf(sc->hn_ifp, "invalid RNDIS status\n"); return; } msg = data; switch (msg->rm_status) { case RNDIS_STATUS_MEDIA_CONNECT: case RNDIS_STATUS_MEDIA_DISCONNECT: hn_update_link_status(sc); break; case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: case RNDIS_STATUS_LINK_SPEED_CHANGE: /* Not really useful; ignore. */ break; case RNDIS_STATUS_NETWORK_CHANGE: ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); if (dlen < ofs + msg->rm_stbuflen || msg->rm_stbuflen < sizeof(uint32_t)) { if_printf(sc->hn_ifp, "network changed\n"); } else { uint32_t change; memcpy(&change, ((const uint8_t *)msg) + ofs, sizeof(change)); if_printf(sc->hn_ifp, "network changed, change %u\n", change); } hn_change_network(sc); break; default: if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", msg->rm_status); break; } } static int hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) { const struct rndis_pktinfo *pi = info_data; uint32_t mask = 0; while (info_dlen != 0) { const void *data; uint32_t dlen; if (__predict_false(info_dlen < sizeof(*pi))) return (EINVAL); if (__predict_false(info_dlen < pi->rm_size)) return (EINVAL); info_dlen -= pi->rm_size; if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) return (EINVAL); if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) return (EINVAL); dlen = pi->rm_size - pi->rm_pktinfooffset; data = pi->rm_data; if (pi->rm_internal == 1) { switch (pi->rm_type) { case NDIS_PKTINFO_IT_PKTINFO_ID: if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) return (EINVAL); info->pktinfo_id = (const struct packet_info_id *)data; mask |= HN_RXINFO_PKTINFO_ID; break; default: goto next; } } else { switch (pi->rm_type) { case NDIS_PKTINFO_TYPE_VLAN: if (__predict_false(dlen 
< NDIS_VLAN_INFO_SIZE)) return (EINVAL); info->vlan_info = (const uint32_t *)data; mask |= HN_RXINFO_VLAN; break; case NDIS_PKTINFO_TYPE_CSUM: if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); info->csum_info = (const uint32_t *)data; mask |= HN_RXINFO_CSUM; break; case HN_NDIS_PKTINFO_TYPE_HASHVAL: if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); info->hash_value = (const uint32_t *)data; mask |= HN_RXINFO_HASHVAL; break; case HN_NDIS_PKTINFO_TYPE_HASHINF: if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); info->hash_info = (const uint32_t *)data; mask |= HN_RXINFO_HASHINF; break; default: goto next; } } if (mask == HN_RXINFO_ALL) { /* All found; done */ break; } next: pi = (const struct rndis_pktinfo *) ((const uint8_t *)pi + pi->rm_size); } /* * Final fixup. * - If there is no hash value, invalidate the hash info. */ if ((mask & HN_RXINFO_HASHVAL) == 0) info->hash_info = NULL; return (0); } static __inline bool hn_rndis_check_overlap(int off, int len, int check_off, int check_len) { if (off < check_off) { if (__predict_true(off + len <= check_off)) return (false); } else if (off > check_off) { if (__predict_true(check_off + check_len <= off)) return (false); } return (true); } static __inline void hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, uint32_t len, struct hn_rxinfo *info) { uint32_t cnt = rxr->rsc.cnt; if (cnt) { rxr->rsc.pktlen += len; } else { rxr->rsc.vlan_info = info->vlan_info; rxr->rsc.csum_info = info->csum_info; rxr->rsc.hash_info = info->hash_info; rxr->rsc.hash_value = info->hash_value; rxr->rsc.pktlen = len; } rxr->rsc.frag_data[cnt] = data; rxr->rsc.frag_len[cnt] = len; rxr->rsc.cnt++; } static void hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_packet_msg *pkt; struct hn_rxinfo info; int data_off, pktinfo_off, data_len, pktinfo_len; bool rsc_more= false; /* * Check length. 
*/ if (__predict_false(dlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); return; } pkt = data; if (__predict_false(dlen < pkt->rm_len)) { if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " "dlen %d, msglen %u\n", dlen, pkt->rm_len); return; } if (__predict_false(pkt->rm_len < pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " "msglen %u, data %u, oob %u, pktinfo %u\n", pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, pkt->rm_pktinfolen); return; } if (__predict_false(pkt->rm_datalen == 0)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); return; } /* * Check offests. */ #define IS_OFFSET_INVALID(ofs) \ ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) /* XXX Hyper-V does not meet data offset alignment requirement */ if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data offset %u\n", pkt->rm_dataoffset); return; } if (__predict_false(pkt->rm_oobdataoffset > 0 && IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob offset %u\n", pkt->rm_oobdataoffset); return; } if (__predict_true(pkt->rm_pktinfooffset > 0) && __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo offset %u\n", pkt->rm_pktinfooffset); return; } #undef IS_OFFSET_INVALID data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); data_len = pkt->rm_datalen; pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); pktinfo_len = pkt->rm_pktinfolen; /* * Check OOB coverage. 
*/ if (__predict_false(pkt->rm_oobdatalen != 0)) { int oob_off, oob_len; if_printf(rxr->hn_ifp, "got oobdata\n"); oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); oob_len = pkt->rm_oobdatalen; if (__predict_false(oob_off + oob_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overflow, msglen %u, oob abs %d len %d\n", pkt->rm_len, oob_off, oob_len); return; } /* * Check against data. */ if (hn_rndis_check_overlap(oob_off, oob_len, data_off, data_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps data, oob abs %d len %d, " "data abs %d len %d\n", oob_off, oob_len, data_off, data_len); return; } /* * Check against pktinfo. */ if (pktinfo_len != 0 && hn_rndis_check_overlap(oob_off, oob_len, pktinfo_off, pktinfo_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps pktinfo, oob abs %d len %d, " "pktinfo abs %d len %d\n", oob_off, oob_len, pktinfo_off, pktinfo_len); return; } } /* * Check per-packet-info coverage and find useful per-packet-info. */ info.vlan_info = NULL; info.csum_info = NULL; info.hash_info = NULL; info.pktinfo_id = NULL; if (__predict_true(pktinfo_len != 0)) { bool overlap; int error; if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overflow, msglen %u, " "pktinfo abs %d len %d\n", pkt->rm_len, pktinfo_off, pktinfo_len); return; } /* * Check packet info coverage. */ overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, data_off, data_len); if (__predict_false(overlap)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overlap data, pktinfo abs %d len %d, " "data abs %d len %d\n", pktinfo_off, pktinfo_len, data_off, data_len); return; } /* * Find useful per-packet-info. 
*/ error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, pktinfo_len, &info); if (__predict_false(error)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " "pktinfo\n"); return; } } if (__predict_false(data_off + data_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data overflow, msglen %u, data abs %d len %d\n", pkt->rm_len, data_off, data_len); return; } /* Identify RSC fragments, drop invalid packets */ if ((info.pktinfo_id != NULL) && (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { rxr->rsc.cnt = 0; rxr->hn_rsc_pkts++; } else if (rxr->rsc.cnt == 0) goto drop; rsc_more = true; if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) rsc_more = false; if (rsc_more && rxr->rsc.is_last) goto drop; } else { rxr->rsc.cnt = 0; } if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) goto drop; /* Store data in per rx ring structure */ hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, data_len, &info); if (rsc_more) return; hn_rxpkt(rxr); rxr->rsc.cnt = 0; return; drop: rxr->hn_rsc_drop++; return; } static __inline void hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_msghdr *hdr; if (__predict_false(dlen < sizeof(*hdr))) { if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); return; } hdr = data; if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { /* Hot data path. */ hn_rndis_rx_data(rxr, data, dlen); /* Done! 
*/ return; } if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); else hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); } static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { struct hn_nvs_sendctx *sndc; sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. */ } static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { struct epoch_tracker et; const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. 
*/ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } NET_EPOCH_ENTER(et); /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } rxr->rsc.is_last = (i == (count - 1)); hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); } NET_EPOCH_EXIT(et); /* * Ack the consumed RXBUF associated w/ this channel packet, * so that this RXBUF can be recycled by the hypervisor. */ hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); } static void hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries, error; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retries = 0; again: error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (__predict_false(error == EAGAIN)) { /* * NOTE: * This should _not_ happen in real world, since the * consumption of the TX bufring from the TX path is * controlled. */ if (rxr->hn_ack_failed == 0) if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); rxr->hn_ack_failed++; retries++; if (retries < 10) { DELAY(100); goto again; } /* RXBUF leaks! 
*/ if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = if_getsoftc(rxr->hn_ifp); for (;;) { struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; int error, pktlen; pktlen = rxr->hn_pktbuf_len; error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); if (__predict_false(error == ENOBUFS)) { void *nbuf; int nlen; /* * Expand channel packet buffer. * * XXX * Use M_WAITOK here, since allocation failure * is fatal. */ nlen = rxr->hn_pktbuf_len * 2; while (nlen < pktlen) nlen *= 2; nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", rxr->hn_pktbuf_len, nlen); free(rxr->hn_pktbuf, M_DEVBUF); rxr->hn_pktbuf = nbuf; rxr->hn_pktbuf_len = nlen; /* Retry! */ continue; } else if (__predict_false(error == EAGAIN)) { /* No more channel packets; done! */ break; } KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hn_nvs_handle_rxbuf(rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } hn_chan_rollup(rxr, rxr->hn_txr); } static void hn_sysinit(void *arg __unused) { int i; hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); #ifdef HN_IFSTART_SUPPORT /* * Don't use ifnet.if_start if transparent VF mode is requested; * mainly due to the IFF_DRV_OACTIVE flag. */ if (hn_xpnt_vf && hn_use_if_start) { hn_use_if_start = 0; printf("hn: tranparent VF mode, if_transmit will be used, " "instead of if_start\n"); } #endif if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { printf("hn: invalid transparent VF attach routing " "wait timeout %d, reset to %d\n", hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; } /* * Initialize VF map. 
*/ rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); hn_vfmap_size = HN_VFMAP_SIZE_DEF; hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF, M_WAITOK | M_ZERO); /* * Fix the # of TX taskqueues. */ if (hn_tx_taskq_cnt <= 0) hn_tx_taskq_cnt = 1; else if (hn_tx_taskq_cnt > mp_ncpus) hn_tx_taskq_cnt = mp_ncpus; /* * Fix the TX taskqueue mode. */ switch (hn_tx_taskq_mode) { case HN_TX_TASKQ_M_INDEP: case HN_TX_TASKQ_M_GLOBAL: case HN_TX_TASKQ_M_EVTTQ: break; default: hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; break; } if (vm_guest != VM_GUEST_HV) return; if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) return; hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskque[i]); taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, "hn tx%d", i); } } SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); static void hn_sysuninit(void *arg __unused) { if (hn_tx_taskque != NULL) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(hn_tx_taskque[i]); free(hn_tx_taskque, M_DEVBUF); } if (hn_vfmap != NULL) free(hn_vfmap, M_DEVBUF); rm_destroy(&hn_vfmap_lock); counter_u64_free(hn_udpcs_fixup); } SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); diff --git a/sys/dev/my/if_my.c b/sys/dev/my/if_my.c index 2bf4573d337b..631c38df9dca 100644 --- a/sys/dev/my/if_my.c +++ b/sys/dev/my/if_my.c @@ -1,1761 +1,1761 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Written by: yen_cw@myson.com.tw * Copyright (c) 2002 Myson Technology Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Myson fast ethernet PCI NIC driver, available at: http://www.myson.com.tw/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define NBPFILTER 1 #include #include #include #include #include #include #include #include #include /* for vtophys */ #include /* for vtophys */ #include #include #include #include #include #include /* * #define MY_USEIOSPACE */ static int MY_USEIOSPACE = 1; #ifdef MY_USEIOSPACE #define MY_RES SYS_RES_IOPORT #define MY_RID MY_PCI_LOIO #else #define MY_RES SYS_RES_MEMORY #define MY_RID MY_PCI_LOMEM #endif #include /* * Various supported device vendors/types and their names. 
*/ struct my_type *my_info_tmp; static struct my_type my_devs[] = { {MYSONVENDORID, MTD800ID, "Myson MTD80X Based Fast Ethernet Card"}, {MYSONVENDORID, MTD803ID, "Myson MTD80X Based Fast Ethernet Card"}, {MYSONVENDORID, MTD891ID, "Myson MTD89X Based Giga Ethernet Card"}, {0, 0, NULL} }; /* * Various supported PHY vendors/types and their names. Note that this driver * will work with pretty much any MII-compliant PHY, so failure to positively * identify the chip is not a fatal error. */ static struct my_type my_phys[] = { {MysonPHYID0, MysonPHYID0, ""}, {SeeqPHYID0, SeeqPHYID0, ""}, {AhdocPHYID0, AhdocPHYID0, ""}, {MarvellPHYID0, MarvellPHYID0, ""}, {LevelOnePHYID0, LevelOnePHYID0, ""}, {0, 0, ""} }; static int my_probe(device_t); static int my_attach(device_t); static int my_detach(device_t); static int my_newbuf(struct my_softc *, struct my_chain_onefrag *); static int my_encap(struct my_softc *, struct my_chain *, struct mbuf *); static void my_rxeof(struct my_softc *); static void my_txeof(struct my_softc *); static void my_txeoc(struct my_softc *); static void my_intr(void *); static void my_start(if_t); static void my_start_locked(if_t); static int my_ioctl(if_t, u_long, caddr_t); static void my_init(void *); static void my_init_locked(struct my_softc *); static void my_stop(struct my_softc *); static void my_autoneg_timeout(void *); static void my_watchdog(void *); static int my_shutdown(device_t); static int my_ifmedia_upd(if_t); static void my_ifmedia_sts(if_t, struct ifmediareq *); static u_int16_t my_phy_readreg(struct my_softc *, int); static void my_phy_writereg(struct my_softc *, int, int); static void my_autoneg_xmit(struct my_softc *); static void my_autoneg_mii(struct my_softc *, int, int); static void my_setmode_mii(struct my_softc *, int); static void my_getmode_mii(struct my_softc *); static void my_setcfg(struct my_softc *, int); static void my_setmulti(struct my_softc *); static void my_reset(struct my_softc *); static int my_list_rx_init(struct 
my_softc *); static int my_list_tx_init(struct my_softc *); static long my_send_cmd_to_phy(struct my_softc *, int, int); #define MY_SETBIT(sc, reg, x) CSR_WRITE_4(sc, reg, CSR_READ_4(sc, reg) | (x)) #define MY_CLRBIT(sc, reg, x) CSR_WRITE_4(sc, reg, CSR_READ_4(sc, reg) & ~(x)) static device_method_t my_methods[] = { /* Device interface */ DEVMETHOD(device_probe, my_probe), DEVMETHOD(device_attach, my_attach), DEVMETHOD(device_detach, my_detach), DEVMETHOD(device_shutdown, my_shutdown), DEVMETHOD_END }; static driver_t my_driver = { "my", my_methods, sizeof(struct my_softc) }; DRIVER_MODULE(my, pci, my_driver, 0, 0); MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, my, my_devs, nitems(my_devs) - 1); MODULE_DEPEND(my, pci, 1, 1, 1); MODULE_DEPEND(my, ether, 1, 1, 1); static long my_send_cmd_to_phy(struct my_softc * sc, int opcode, int regad) { long miir; int i; int mask, data; MY_LOCK_ASSERT(sc); /* enable MII output */ miir = CSR_READ_4(sc, MY_MANAGEMENT); miir &= 0xfffffff0; miir |= MY_MASK_MIIR_MII_WRITE + MY_MASK_MIIR_MII_MDO; /* send 32 1's preamble */ for (i = 0; i < 32; i++) { /* low MDC; MDO is already high (miir) */ miir &= ~MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); /* high MDC */ miir |= MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); } /* calculate ST+OP+PHYAD+REGAD+TA */ data = opcode | (sc->my_phy_addr << 7) | (regad << 2); /* sent out */ mask = 0x8000; while (mask) { /* low MDC, prepare MDO */ miir &= ~(MY_MASK_MIIR_MII_MDC + MY_MASK_MIIR_MII_MDO); if (mask & data) miir |= MY_MASK_MIIR_MII_MDO; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); /* high MDC */ miir |= MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); DELAY(30); /* next */ mask >>= 1; if (mask == 0x2 && opcode == MY_OP_READ) miir &= ~MY_MASK_MIIR_MII_WRITE; } return miir; } static u_int16_t my_phy_readreg(struct my_softc * sc, int reg) { long miir; int mask, data; MY_LOCK_ASSERT(sc); if (sc->my_info->my_did == MTD803ID) data = CSR_READ_2(sc, MY_PHYBASE + reg * 
2); else { miir = my_send_cmd_to_phy(sc, MY_OP_READ, reg); /* read data */ mask = 0x8000; data = 0; while (mask) { /* low MDC */ miir &= ~MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); /* read MDI */ miir = CSR_READ_4(sc, MY_MANAGEMENT); if (miir & MY_MASK_MIIR_MII_MDI) data |= mask; /* high MDC, and wait */ miir |= MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); DELAY(30); /* next */ mask >>= 1; } /* low MDC */ miir &= ~MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); } return (u_int16_t) data; } static void my_phy_writereg(struct my_softc * sc, int reg, int data) { long miir; int mask; MY_LOCK_ASSERT(sc); if (sc->my_info->my_did == MTD803ID) CSR_WRITE_2(sc, MY_PHYBASE + reg * 2, data); else { miir = my_send_cmd_to_phy(sc, MY_OP_WRITE, reg); /* write data */ mask = 0x8000; while (mask) { /* low MDC, prepare MDO */ miir &= ~(MY_MASK_MIIR_MII_MDC + MY_MASK_MIIR_MII_MDO); if (mask & data) miir |= MY_MASK_MIIR_MII_MDO; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); DELAY(1); /* high MDC */ miir |= MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); DELAY(1); /* next */ mask >>= 1; } /* low MDC */ miir &= ~MY_MASK_MIIR_MII_MDC; CSR_WRITE_4(sc, MY_MANAGEMENT, miir); } return; } static u_int my_hash_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { uint32_t *hashes = arg; int h; h = ~ether_crc32_be(LLADDR(sdl), ETHER_ADDR_LEN) >> 26; if (h < 32) hashes[0] |= (1 << h); else hashes[1] |= (1 << (h - 32)); return (1); } /* * Program the 64-bit multicast hash filter. 
*/ static void my_setmulti(struct my_softc * sc) { if_t ifp; u_int32_t hashes[2] = {0, 0}; u_int32_t rxfilt; MY_LOCK_ASSERT(sc); ifp = sc->my_ifp; rxfilt = CSR_READ_4(sc, MY_TCRRCR); if (if_getflags(ifp) & IFF_ALLMULTI || if_getflags(ifp) & IFF_PROMISC) { rxfilt |= MY_AM; CSR_WRITE_4(sc, MY_TCRRCR, rxfilt); CSR_WRITE_4(sc, MY_MAR0, 0xFFFFFFFF); CSR_WRITE_4(sc, MY_MAR1, 0xFFFFFFFF); return; } /* first, zot all the existing hash bits */ CSR_WRITE_4(sc, MY_MAR0, 0); CSR_WRITE_4(sc, MY_MAR1, 0); /* now program new ones */ if (if_foreach_llmaddr(ifp, my_hash_maddr, hashes) > 0) rxfilt |= MY_AM; else rxfilt &= ~MY_AM; CSR_WRITE_4(sc, MY_MAR0, hashes[0]); CSR_WRITE_4(sc, MY_MAR1, hashes[1]); CSR_WRITE_4(sc, MY_TCRRCR, rxfilt); } /* * Initiate an autonegotiation session. */ static void my_autoneg_xmit(struct my_softc * sc) { u_int16_t phy_sts = 0; MY_LOCK_ASSERT(sc); my_phy_writereg(sc, PHY_BMCR, PHY_BMCR_RESET); DELAY(500); while (my_phy_readreg(sc, PHY_BMCR) & PHY_BMCR_RESET); phy_sts = my_phy_readreg(sc, PHY_BMCR); phy_sts |= PHY_BMCR_AUTONEGENBL | PHY_BMCR_AUTONEGRSTR; my_phy_writereg(sc, PHY_BMCR, phy_sts); return; } static void my_autoneg_timeout(void *arg) { struct my_softc *sc; sc = arg; MY_LOCK_ASSERT(sc); my_autoneg_mii(sc, MY_FLAG_DELAYTIMEO, 1); } /* * Invoke autonegotiation on a PHY. */ static void my_autoneg_mii(struct my_softc * sc, int flag, int verbose) { u_int16_t phy_sts = 0, media, advert, ability; u_int16_t ability2 = 0; if_t ifp; struct ifmedia *ifm; MY_LOCK_ASSERT(sc); ifm = &sc->ifmedia; ifp = sc->my_ifp; ifm->ifm_media = IFM_ETHER | IFM_AUTO; #ifndef FORCE_AUTONEG_TFOUR /* * First, see if autoneg is supported. If not, there's no point in * continuing. 
*/ phy_sts = my_phy_readreg(sc, PHY_BMSR); if (!(phy_sts & PHY_BMSR_CANAUTONEG)) { if (verbose) device_printf(sc->my_dev, "autonegotiation not supported\n"); ifm->ifm_media = IFM_ETHER | IFM_10_T | IFM_HDX; return; } #endif switch (flag) { case MY_FLAG_FORCEDELAY: /* * XXX Never use this option anywhere but in the probe * routine: making the kernel stop dead in its tracks for * three whole seconds after we've gone multi-user is really * bad manners. */ my_autoneg_xmit(sc); DELAY(5000000); break; case MY_FLAG_SCHEDDELAY: /* * Wait for the transmitter to go idle before starting an * autoneg session, otherwise my_start() may clobber our * timeout, and we don't want to allow transmission during an * autoneg session since that can screw it up. */ if (sc->my_cdata.my_tx_head != NULL) { sc->my_want_auto = 1; MY_UNLOCK(sc); return; } my_autoneg_xmit(sc); callout_reset(&sc->my_autoneg_timer, hz * 5, my_autoneg_timeout, sc); sc->my_autoneg = 1; sc->my_want_auto = 0; return; case MY_FLAG_DELAYTIMEO: callout_stop(&sc->my_autoneg_timer); sc->my_autoneg = 0; break; default: device_printf(sc->my_dev, "invalid autoneg flag: %d\n", flag); return; } if (my_phy_readreg(sc, PHY_BMSR) & PHY_BMSR_AUTONEGCOMP) { if (verbose) device_printf(sc->my_dev, "autoneg complete, "); phy_sts = my_phy_readreg(sc, PHY_BMSR); } else { if (verbose) device_printf(sc->my_dev, "autoneg not complete, "); } media = my_phy_readreg(sc, PHY_BMCR); /* Link is good. Report modes and set duplex mode. */ if (my_phy_readreg(sc, PHY_BMSR) & PHY_BMSR_LINKSTAT) { if (verbose) device_printf(sc->my_dev, "link status good. 
"); advert = my_phy_readreg(sc, PHY_ANAR); ability = my_phy_readreg(sc, PHY_LPAR); if ((sc->my_pinfo->my_vid == MarvellPHYID0) || (sc->my_pinfo->my_vid == LevelOnePHYID0)) { ability2 = my_phy_readreg(sc, PHY_1000SR); if (ability2 & PHY_1000SR_1000BTXFULL) { advert = 0; ability = 0; /* * this version did not support 1000M, * ifm->ifm_media = * IFM_ETHER|IFM_1000_T|IFM_FDX; */ ifm->ifm_media = IFM_ETHER | IFM_100_TX | IFM_FDX; media &= ~PHY_BMCR_SPEEDSEL; media |= PHY_BMCR_1000; media |= PHY_BMCR_DUPLEX; printf("(full-duplex, 1000Mbps)\n"); } else if (ability2 & PHY_1000SR_1000BTXHALF) { advert = 0; ability = 0; /* * this version did not support 1000M, * ifm->ifm_media = IFM_ETHER|IFM_1000_T; */ ifm->ifm_media = IFM_ETHER | IFM_100_TX; media &= ~PHY_BMCR_SPEEDSEL; media &= ~PHY_BMCR_DUPLEX; media |= PHY_BMCR_1000; printf("(half-duplex, 1000Mbps)\n"); } } if (advert & PHY_ANAR_100BT4 && ability & PHY_ANAR_100BT4) { ifm->ifm_media = IFM_ETHER | IFM_100_T4; media |= PHY_BMCR_SPEEDSEL; media &= ~PHY_BMCR_DUPLEX; printf("(100baseT4)\n"); } else if (advert & PHY_ANAR_100BTXFULL && ability & PHY_ANAR_100BTXFULL) { ifm->ifm_media = IFM_ETHER | IFM_100_TX | IFM_FDX; media |= PHY_BMCR_SPEEDSEL; media |= PHY_BMCR_DUPLEX; printf("(full-duplex, 100Mbps)\n"); } else if (advert & PHY_ANAR_100BTXHALF && ability & PHY_ANAR_100BTXHALF) { ifm->ifm_media = IFM_ETHER | IFM_100_TX | IFM_HDX; media |= PHY_BMCR_SPEEDSEL; media &= ~PHY_BMCR_DUPLEX; printf("(half-duplex, 100Mbps)\n"); } else if (advert & PHY_ANAR_10BTFULL && ability & PHY_ANAR_10BTFULL) { ifm->ifm_media = IFM_ETHER | IFM_10_T | IFM_FDX; media &= ~PHY_BMCR_SPEEDSEL; media |= PHY_BMCR_DUPLEX; printf("(full-duplex, 10Mbps)\n"); } else if (advert) { ifm->ifm_media = IFM_ETHER | IFM_10_T | IFM_HDX; media &= ~PHY_BMCR_SPEEDSEL; media &= ~PHY_BMCR_DUPLEX; printf("(half-duplex, 10Mbps)\n"); } media &= ~PHY_BMCR_AUTONEGENBL; /* Set ASIC's duplex mode to match the PHY. 
*/ my_phy_writereg(sc, PHY_BMCR, media); my_setcfg(sc, media); } else { if (verbose) device_printf(sc->my_dev, "no carrier\n"); } my_init_locked(sc); if (sc->my_tx_pend) { sc->my_autoneg = 0; sc->my_tx_pend = 0; my_start_locked(ifp); } return; } /* * To get PHY ability. */ static void my_getmode_mii(struct my_softc * sc) { u_int16_t bmsr; if_t ifp; MY_LOCK_ASSERT(sc); ifp = sc->my_ifp; bmsr = my_phy_readreg(sc, PHY_BMSR); if (bootverbose) device_printf(sc->my_dev, "PHY status word: %x\n", bmsr); /* fallback */ sc->ifmedia.ifm_media = IFM_ETHER | IFM_10_T | IFM_HDX; if (bmsr & PHY_BMSR_10BTHALF) { if (bootverbose) device_printf(sc->my_dev, "10Mbps half-duplex mode supported\n"); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_10_T | IFM_HDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_10_T, 0, NULL); } if (bmsr & PHY_BMSR_10BTFULL) { if (bootverbose) device_printf(sc->my_dev, "10Mbps full-duplex mode supported\n"); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL); sc->ifmedia.ifm_media = IFM_ETHER | IFM_10_T | IFM_FDX; } if (bmsr & PHY_BMSR_100BTXHALF) { if (bootverbose) device_printf(sc->my_dev, "100Mbps half-duplex mode supported\n"); if_setbaudrate(ifp, 100000000); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_100_TX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_100_TX | IFM_HDX, 0, NULL); sc->ifmedia.ifm_media = IFM_ETHER | IFM_100_TX | IFM_HDX; } if (bmsr & PHY_BMSR_100BTXFULL) { if (bootverbose) device_printf(sc->my_dev, "100Mbps full-duplex mode supported\n"); if_setbaudrate(ifp, 100000000); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL); sc->ifmedia.ifm_media = IFM_ETHER | IFM_100_TX | IFM_FDX; } /* Some also support 100BaseT4. 
*/ if (bmsr & PHY_BMSR_100BT4) { if (bootverbose) device_printf(sc->my_dev, "100baseT4 mode supported\n"); if_setbaudrate(ifp, 100000000); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_100_T4, 0, NULL); sc->ifmedia.ifm_media = IFM_ETHER | IFM_100_T4; #ifdef FORCE_AUTONEG_TFOUR if (bootverbose) device_printf(sc->my_dev, "forcing on autoneg support for BT4\n"); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_AUTO, 0 NULL): sc->ifmedia.ifm_media = IFM_ETHER | IFM_AUTO; #endif } #if 0 /* this version did not support 1000M, */ if (sc->my_pinfo->my_vid == MarvellPHYID0) { if (bootverbose) device_printf(sc->my_dev, "1000Mbps half-duplex mode supported\n"); if_setbaudrate(ifp, 1000000000); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_1000_T, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_1000_T | IFM_HDX, 0, NULL); if (bootverbose) device_printf(sc->my_dev, "1000Mbps full-duplex mode supported\n"); if_setbaudrate(ifp, 1000000000); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); sc->ifmedia.ifm_media = IFM_ETHER | IFM_1000_T | IFM_FDX; } #endif if (bmsr & PHY_BMSR_CANAUTONEG) { if (bootverbose) device_printf(sc->my_dev, "autoneg supported\n"); ifmedia_add(&sc->ifmedia, IFM_ETHER | IFM_AUTO, 0, NULL); sc->ifmedia.ifm_media = IFM_ETHER | IFM_AUTO; } return; } /* * Set speed and duplex mode. */ static void my_setmode_mii(struct my_softc * sc, int media) { u_int16_t bmcr; MY_LOCK_ASSERT(sc); /* * If an autoneg session is in progress, stop it. 
*/ if (sc->my_autoneg) { device_printf(sc->my_dev, "canceling autoneg session\n"); callout_stop(&sc->my_autoneg_timer); sc->my_autoneg = sc->my_want_auto = 0; bmcr = my_phy_readreg(sc, PHY_BMCR); bmcr &= ~PHY_BMCR_AUTONEGENBL; my_phy_writereg(sc, PHY_BMCR, bmcr); } device_printf(sc->my_dev, "selecting MII, "); bmcr = my_phy_readreg(sc, PHY_BMCR); bmcr &= ~(PHY_BMCR_AUTONEGENBL | PHY_BMCR_SPEEDSEL | PHY_BMCR_1000 | PHY_BMCR_DUPLEX | PHY_BMCR_LOOPBK); #if 0 /* this version did not support 1000M, */ if (IFM_SUBTYPE(media) == IFM_1000_T) { printf("1000Mbps/T4, half-duplex\n"); bmcr &= ~PHY_BMCR_SPEEDSEL; bmcr &= ~PHY_BMCR_DUPLEX; bmcr |= PHY_BMCR_1000; } #endif if (IFM_SUBTYPE(media) == IFM_100_T4) { printf("100Mbps/T4, half-duplex\n"); bmcr |= PHY_BMCR_SPEEDSEL; bmcr &= ~PHY_BMCR_DUPLEX; } if (IFM_SUBTYPE(media) == IFM_100_TX) { printf("100Mbps, "); bmcr |= PHY_BMCR_SPEEDSEL; } if (IFM_SUBTYPE(media) == IFM_10_T) { printf("10Mbps, "); bmcr &= ~PHY_BMCR_SPEEDSEL; } if ((media & IFM_GMASK) == IFM_FDX) { printf("full duplex\n"); bmcr |= PHY_BMCR_DUPLEX; } else { printf("half duplex\n"); bmcr &= ~PHY_BMCR_DUPLEX; } my_phy_writereg(sc, PHY_BMCR, bmcr); my_setcfg(sc, bmcr); return; } /* * The Myson manual states that in order to fiddle with the 'full-duplex' and * '100Mbps' bits in the netconfig register, we first have to put the * transmit and/or receive logic in the idle state. 
*/ static void my_setcfg(struct my_softc * sc, int bmcr) { int i, restart = 0; MY_LOCK_ASSERT(sc); if (CSR_READ_4(sc, MY_TCRRCR) & (MY_TE | MY_RE)) { restart = 1; MY_CLRBIT(sc, MY_TCRRCR, (MY_TE | MY_RE)); for (i = 0; i < MY_TIMEOUT; i++) { DELAY(10); if (!(CSR_READ_4(sc, MY_TCRRCR) & (MY_TXRUN | MY_RXRUN))) break; } if (i == MY_TIMEOUT) device_printf(sc->my_dev, "failed to force tx and rx to idle \n"); } MY_CLRBIT(sc, MY_TCRRCR, MY_PS1000); MY_CLRBIT(sc, MY_TCRRCR, MY_PS10); if (bmcr & PHY_BMCR_1000) MY_SETBIT(sc, MY_TCRRCR, MY_PS1000); else if (!(bmcr & PHY_BMCR_SPEEDSEL)) MY_SETBIT(sc, MY_TCRRCR, MY_PS10); if (bmcr & PHY_BMCR_DUPLEX) MY_SETBIT(sc, MY_TCRRCR, MY_FD); else MY_CLRBIT(sc, MY_TCRRCR, MY_FD); if (restart) MY_SETBIT(sc, MY_TCRRCR, MY_TE | MY_RE); return; } static void my_reset(struct my_softc * sc) { int i; MY_LOCK_ASSERT(sc); MY_SETBIT(sc, MY_BCR, MY_SWR); for (i = 0; i < MY_TIMEOUT; i++) { DELAY(10); if (!(CSR_READ_4(sc, MY_BCR) & MY_SWR)) break; } if (i == MY_TIMEOUT) device_printf(sc->my_dev, "reset never completed!\n"); /* Wait a little while for the chip to get its brains in order. */ DELAY(1000); return; } /* * Probe for a Myson chip. Check the PCI vendor and device IDs against our * list and return a device name if we find a match. */ static int my_probe(device_t dev) { struct my_type *t; t = my_devs; while (t->my_name != NULL) { if ((pci_get_vendor(dev) == t->my_vid) && (pci_get_device(dev) == t->my_did)) { device_set_desc(dev, t->my_name); my_info_tmp = t; return (BUS_PROBE_DEFAULT); } t++; } return (ENXIO); } /* * Attach the interface. Allocate softc structures, do ifmedia setup and * ethernet/BPF attach. 
*/ static int my_attach(device_t dev) { int i; u_char eaddr[ETHER_ADDR_LEN]; u_int32_t iobase; struct my_softc *sc; if_t ifp; int media = IFM_ETHER | IFM_100_TX | IFM_FDX; unsigned int round; caddr_t roundptr; struct my_type *p; u_int16_t phy_vid, phy_did, phy_sts = 0; int rid, error = 0; sc = device_get_softc(dev); sc->my_dev = dev; mtx_init(&sc->my_mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->my_autoneg_timer, &sc->my_mtx, 0); callout_init_mtx(&sc->my_watchdog, &sc->my_mtx, 0); /* * Map control/status registers. */ pci_enable_busmaster(dev); if (my_info_tmp->my_did == MTD800ID) { iobase = pci_read_config(dev, MY_PCI_LOIO, 4); if (iobase & 0x300) MY_USEIOSPACE = 0; } rid = MY_RID; sc->my_res = bus_alloc_resource_any(dev, MY_RES, &rid, RF_ACTIVE); if (sc->my_res == NULL) { device_printf(dev, "couldn't map ports/memory\n"); error = ENXIO; goto destroy_mutex; } sc->my_btag = rman_get_bustag(sc->my_res); sc->my_bhandle = rman_get_bushandle(sc->my_res); rid = 0; sc->my_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (sc->my_irq == NULL) { device_printf(dev, "couldn't map interrupt\n"); error = ENXIO; goto release_io; } sc->my_info = my_info_tmp; /* Reset the adapter. 
*/ MY_LOCK(sc); my_reset(sc); MY_UNLOCK(sc); /* * Get station address */ for (i = 0; i < ETHER_ADDR_LEN; ++i) eaddr[i] = CSR_READ_1(sc, MY_PAR0 + i); sc->my_ldata_ptr = malloc(sizeof(struct my_list_data) + 8, M_DEVBUF, M_NOWAIT); if (sc->my_ldata_ptr == NULL) { device_printf(dev, "no memory for list buffers!\n"); error = ENXIO; goto release_irq; } sc->my_ldata = (struct my_list_data *) sc->my_ldata_ptr; round = (uintptr_t)sc->my_ldata_ptr & 0xF; roundptr = sc->my_ldata_ptr; for (i = 0; i < 8; i++) { if (round % 8) { round++; roundptr++; } else break; } sc->my_ldata = (struct my_list_data *) roundptr; bzero(sc->my_ldata, sizeof(struct my_list_data)); ifp = sc->my_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not if_alloc()\n"); error = ENOSPC; goto free_ldata; } if_setsoftc(ifp, sc); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); if_setioctlfn(ifp, my_ioctl); if_setstartfn(ifp, my_start); if_setinitfn(ifp, my_init); if_setbaudrate(ifp, 10000000); if_setsendqlen(ifp, ifqmaxlen); if_setsendqready(ifp); if (sc->my_info->my_did == MTD803ID) sc->my_pinfo = my_phys; else { if (bootverbose) device_printf(dev, "probing for a PHY\n"); MY_LOCK(sc); for (i = MY_PHYADDR_MIN; i < MY_PHYADDR_MAX + 1; i++) { if (bootverbose) device_printf(dev, "checking address: %d\n", i); sc->my_phy_addr = i; phy_sts = my_phy_readreg(sc, PHY_BMSR); if ((phy_sts != 0) && (phy_sts != 0xffff)) break; else phy_sts = 0; } if (phy_sts) { phy_vid = my_phy_readreg(sc, PHY_VENID); phy_did = my_phy_readreg(sc, PHY_DEVID); if (bootverbose) { device_printf(dev, "found PHY at address %d, ", sc->my_phy_addr); printf("vendor id: %x device id: %x\n", phy_vid, phy_did); } p = my_phys; while (p->my_vid) { if (phy_vid == p->my_vid) { sc->my_pinfo = p; break; } p++; } if (sc->my_pinfo == NULL) sc->my_pinfo = &my_phys[PHY_UNKNOWN]; if (bootverbose) device_printf(dev, "PHY type: %s\n", sc->my_pinfo->my_name); } else { 
MY_UNLOCK(sc); device_printf(dev, "MII without any phy!\n"); error = ENXIO; goto free_if; } MY_UNLOCK(sc); } /* Do ifmedia setup. */ ifmedia_init(&sc->ifmedia, 0, my_ifmedia_upd, my_ifmedia_sts); MY_LOCK(sc); my_getmode_mii(sc); my_autoneg_mii(sc, MY_FLAG_FORCEDELAY, 1); media = sc->ifmedia.ifm_media; my_stop(sc); MY_UNLOCK(sc); ifmedia_set(&sc->ifmedia, media); ether_ifattach(ifp, eaddr); error = bus_setup_intr(dev, sc->my_irq, INTR_TYPE_NET | INTR_MPSAFE, NULL, my_intr, sc, &sc->my_intrhand); if (error) { device_printf(dev, "couldn't set up irq\n"); goto detach_if; } return (0); detach_if: ether_ifdetach(ifp); free_if: if_free(ifp); free_ldata: free(sc->my_ldata_ptr, M_DEVBUF); release_irq: bus_release_resource(dev, SYS_RES_IRQ, 0, sc->my_irq); release_io: bus_release_resource(dev, MY_RES, MY_RID, sc->my_res); destroy_mutex: mtx_destroy(&sc->my_mtx); return (error); } static int my_detach(device_t dev) { struct my_softc *sc; if_t ifp; sc = device_get_softc(dev); ifp = sc->my_ifp; ether_ifdetach(ifp); MY_LOCK(sc); my_stop(sc); MY_UNLOCK(sc); bus_teardown_intr(dev, sc->my_irq, sc->my_intrhand); callout_drain(&sc->my_watchdog); callout_drain(&sc->my_autoneg_timer); if_free(ifp); free(sc->my_ldata_ptr, M_DEVBUF); bus_release_resource(dev, SYS_RES_IRQ, 0, sc->my_irq); bus_release_resource(dev, MY_RES, MY_RID, sc->my_res); mtx_destroy(&sc->my_mtx); return (0); } /* * Initialize the transmit descriptors. */ static int my_list_tx_init(struct my_softc * sc) { struct my_chain_data *cd; struct my_list_data *ld; int i; MY_LOCK_ASSERT(sc); cd = &sc->my_cdata; ld = sc->my_ldata; for (i = 0; i < MY_TX_LIST_CNT; i++) { cd->my_tx_chain[i].my_ptr = &ld->my_tx_list[i]; if (i == (MY_TX_LIST_CNT - 1)) cd->my_tx_chain[i].my_nextdesc = &cd->my_tx_chain[0]; else cd->my_tx_chain[i].my_nextdesc = &cd->my_tx_chain[i + 1]; } cd->my_tx_free = &cd->my_tx_chain[0]; cd->my_tx_tail = cd->my_tx_head = NULL; return (0); } /* * Initialize the RX descriptors and allocate mbufs for them. 
Note that we * arrange the descriptors in a closed ring, so that the last descriptor * points back to the first. */ static int my_list_rx_init(struct my_softc * sc) { struct my_chain_data *cd; struct my_list_data *ld; int i; MY_LOCK_ASSERT(sc); cd = &sc->my_cdata; ld = sc->my_ldata; for (i = 0; i < MY_RX_LIST_CNT; i++) { cd->my_rx_chain[i].my_ptr = (struct my_desc *) & ld->my_rx_list[i]; if (my_newbuf(sc, &cd->my_rx_chain[i]) == ENOBUFS) { MY_UNLOCK(sc); return (ENOBUFS); } if (i == (MY_RX_LIST_CNT - 1)) { cd->my_rx_chain[i].my_nextdesc = &cd->my_rx_chain[0]; ld->my_rx_list[i].my_next = vtophys(&ld->my_rx_list[0]); } else { cd->my_rx_chain[i].my_nextdesc = &cd->my_rx_chain[i + 1]; ld->my_rx_list[i].my_next = vtophys(&ld->my_rx_list[i + 1]); } } cd->my_rx_head = &cd->my_rx_chain[0]; return (0); } /* * Initialize an RX descriptor and attach an MBUF cluster. */ static int my_newbuf(struct my_softc * sc, struct my_chain_onefrag * c) { struct mbuf *m_new = NULL; MY_LOCK_ASSERT(sc); MGETHDR(m_new, M_NOWAIT, MT_DATA); if (m_new == NULL) { device_printf(sc->my_dev, "no memory for rx list -- packet dropped!\n"); return (ENOBUFS); } if (!(MCLGET(m_new, M_NOWAIT))) { device_printf(sc->my_dev, "no memory for rx list -- packet dropped!\n"); m_freem(m_new); return (ENOBUFS); } c->my_mbuf = m_new; c->my_ptr->my_data = vtophys(mtod(m_new, caddr_t)); c->my_ptr->my_ctl = (MCLBYTES - 1) << MY_RBSShift; c->my_ptr->my_status = MY_OWNByNIC; return (0); } /* * A frame has been uploaded: pass the resulting mbuf chain up to the higher * level protocols. 
*/ static void my_rxeof(struct my_softc * sc) { struct ether_header *eh; struct mbuf *m; if_t ifp; struct my_chain_onefrag *cur_rx; int total_len = 0; u_int32_t rxstat; MY_LOCK_ASSERT(sc); ifp = sc->my_ifp; while (!((rxstat = sc->my_cdata.my_rx_head->my_ptr->my_status) & MY_OWNByNIC)) { cur_rx = sc->my_cdata.my_rx_head; sc->my_cdata.my_rx_head = cur_rx->my_nextdesc; if (rxstat & MY_ES) { /* error summary: give up this rx pkt */ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); cur_rx->my_ptr->my_status = MY_OWNByNIC; continue; } /* No errors; receive the packet. */ total_len = (rxstat & MY_FLNGMASK) >> MY_FLNGShift; total_len -= ETHER_CRC_LEN; if (total_len < MINCLSIZE) { m = m_devget(mtod(cur_rx->my_mbuf, char *), total_len, 0, ifp, NULL); cur_rx->my_ptr->my_status = MY_OWNByNIC; if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); continue; } } else { m = cur_rx->my_mbuf; /* * Try to conjure up a new mbuf cluster. If that * fails, it means we have an out of memory condition * and should leave the buffer in place and continue. * This will result in a lost packet, but there's * little else we can do in this situation. */ if (my_newbuf(sc, cur_rx) == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); cur_rx->my_ptr->my_status = MY_OWNByNIC; continue; } m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = total_len; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); eh = mtod(m, struct ether_header *); #if NBPFILTER > 0 /* * Handle BPF listeners. Let the BPF user see the packet, but * don't pass it up to the ether_input() layer unless it's a * broadcast packet, multicast packet, matches our ethernet * address or the interface is in promiscuous mode. 
*/ - if (bpf_peers_present(if_getbpf(ifp))) { + if (bpf_peers_present_if(ifp)) { bpf_mtap_if(ifp, m); if (if_getflags(ifp) & IFF_PROMISC && (bcmp(eh->ether_dhost, if_getlladdr(sc->my_ifp), ETHER_ADDR_LEN) && (eh->ether_dhost[0] & 1) == 0)) { m_freem(m); continue; } } #endif MY_UNLOCK(sc); if_input(ifp, m); MY_LOCK(sc); } return; } /* * A frame was downloaded to the chip. It's safe for us to clean up the list * buffers. */ static void my_txeof(struct my_softc * sc) { struct my_chain *cur_tx; if_t ifp; MY_LOCK_ASSERT(sc); ifp = sc->my_ifp; /* Clear the timeout timer. */ sc->my_timer = 0; if (sc->my_cdata.my_tx_head == NULL) { return; } /* * Go through our tx list and free mbufs for those frames that have * been transmitted. */ while (sc->my_cdata.my_tx_head->my_mbuf != NULL) { u_int32_t txstat; cur_tx = sc->my_cdata.my_tx_head; txstat = MY_TXSTATUS(cur_tx); if ((txstat & MY_OWNByNIC) || txstat == MY_UNSENT) break; if (!(CSR_READ_4(sc, MY_TCRRCR) & MY_Enhanced)) { if (txstat & MY_TXERR) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (txstat & MY_EC) /* excessive collision */ if_inc_counter(ifp, IFCOUNTER_COLLISIONS, 1); if (txstat & MY_LC) /* late collision */ if_inc_counter(ifp, IFCOUNTER_COLLISIONS, 1); } if_inc_counter(ifp, IFCOUNTER_COLLISIONS, (txstat & MY_NCRMASK) >> MY_NCRShift); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); m_freem(cur_tx->my_mbuf); cur_tx->my_mbuf = NULL; if (sc->my_cdata.my_tx_head == sc->my_cdata.my_tx_tail) { sc->my_cdata.my_tx_head = NULL; sc->my_cdata.my_tx_tail = NULL; break; } sc->my_cdata.my_tx_head = cur_tx->my_nextdesc; } if (CSR_READ_4(sc, MY_TCRRCR) & MY_Enhanced) { if_inc_counter(ifp, IFCOUNTER_COLLISIONS, (CSR_READ_4(sc, MY_TSR) & MY_NCRMask)); } return; } /* * TX 'end of channel' interrupt handler. 
*/ static void my_txeoc(struct my_softc * sc) { if_t ifp; MY_LOCK_ASSERT(sc); ifp = sc->my_ifp; sc->my_timer = 0; if (sc->my_cdata.my_tx_head == NULL) { if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); sc->my_cdata.my_tx_tail = NULL; if (sc->my_want_auto) my_autoneg_mii(sc, MY_FLAG_SCHEDDELAY, 1); } else { if (MY_TXOWN(sc->my_cdata.my_tx_head) == MY_UNSENT) { MY_TXOWN(sc->my_cdata.my_tx_head) = MY_OWNByNIC; sc->my_timer = 5; CSR_WRITE_4(sc, MY_TXPDR, 0xFFFFFFFF); } } return; } static void my_intr(void *arg) { struct my_softc *sc; if_t ifp; u_int32_t status; sc = arg; MY_LOCK(sc); ifp = sc->my_ifp; if (!(if_getflags(ifp) & IFF_UP)) { MY_UNLOCK(sc); return; } /* Disable interrupts. */ CSR_WRITE_4(sc, MY_IMR, 0x00000000); for (;;) { status = CSR_READ_4(sc, MY_ISR); status &= MY_INTRS; if (status) CSR_WRITE_4(sc, MY_ISR, status); else break; if (status & MY_RI) /* receive interrupt */ my_rxeof(sc); if ((status & MY_RBU) || (status & MY_RxErr)) { /* rx buffer unavailable or rx error */ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); #ifdef foo my_stop(sc); my_reset(sc); my_init_locked(sc); #endif } if (status & MY_TI) /* tx interrupt */ my_txeof(sc); if (status & MY_ETI) /* tx early interrupt */ my_txeof(sc); if (status & MY_TBU) /* tx buffer unavailable */ my_txeoc(sc); #if 0 /* 90/1/18 delete */ if (status & MY_FBE) { my_reset(sc); my_init_locked(sc); } #endif } /* Re-enable interrupts. */ CSR_WRITE_4(sc, MY_IMR, MY_INTRS); if (!if_sendq_empty(ifp)) my_start_locked(ifp); MY_UNLOCK(sc); return; } /* * Encapsulate an mbuf chain in a descriptor by coupling the mbuf data * pointers to the fragment pointers. */ static int my_encap(struct my_softc * sc, struct my_chain * c, struct mbuf * m_head) { struct my_desc *f = NULL; int total_len; struct mbuf *m, *m_new = NULL; MY_LOCK_ASSERT(sc); /* calculate the total tx pkt length */ total_len = 0; for (m = m_head; m != NULL; m = m->m_next) total_len += m->m_len; /* * Start packing the mbufs in this chain into the fragment pointers. 
* Stop when we run out of fragments or hit the end of the mbuf * chain. */ m = m_head; MGETHDR(m_new, M_NOWAIT, MT_DATA); if (m_new == NULL) { device_printf(sc->my_dev, "no memory for tx list"); return (1); } if (m_head->m_pkthdr.len > MHLEN) { if (!(MCLGET(m_new, M_NOWAIT))) { m_freem(m_new); device_printf(sc->my_dev, "no memory for tx list"); return (1); } } m_copydata(m_head, 0, m_head->m_pkthdr.len, mtod(m_new, caddr_t)); m_new->m_pkthdr.len = m_new->m_len = m_head->m_pkthdr.len; m_freem(m_head); m_head = m_new; f = &c->my_ptr->my_frag[0]; f->my_status = 0; f->my_data = vtophys(mtod(m_new, caddr_t)); total_len = m_new->m_len; f->my_ctl = MY_TXFD | MY_TXLD | MY_CRCEnable | MY_PADEnable; f->my_ctl |= total_len << MY_PKTShift; /* pkt size */ f->my_ctl |= total_len; /* buffer size */ /* 89/12/29 add, for mtd891 *//* [ 89? ] */ if (sc->my_info->my_did == MTD891ID) f->my_ctl |= MY_ETIControl | MY_RetryTxLC; c->my_mbuf = m_head; c->my_lastdesc = 0; MY_TXNEXT(c) = vtophys(&c->my_nextdesc->my_ptr->my_frag[0]); return (0); } /* * Main transmit routine. To avoid having to do mbuf copies, we put pointers * to the mbuf data regions directly in the transmit lists. We also save a * copy of the pointers since the transmit list fragment pointers are * physical addresses. */ static void my_start(if_t ifp) { struct my_softc *sc; sc = if_getsoftc(ifp); MY_LOCK(sc); my_start_locked(ifp); MY_UNLOCK(sc); } static void my_start_locked(if_t ifp) { struct my_softc *sc; struct mbuf *m_head = NULL; struct my_chain *cur_tx = NULL, *start_tx; sc = if_getsoftc(ifp); MY_LOCK_ASSERT(sc); if (sc->my_autoneg) { sc->my_tx_pend = 1; return; } /* * Check for an available queue slot. If there are none, punt. */ if (sc->my_cdata.my_tx_free->my_mbuf != NULL) { if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); return; } start_tx = sc->my_cdata.my_tx_free; while (sc->my_cdata.my_tx_free->my_mbuf == NULL) { m_head = if_dequeue(ifp); if (m_head == NULL) break; /* Pick a descriptor off the free list. 
*/ cur_tx = sc->my_cdata.my_tx_free; sc->my_cdata.my_tx_free = cur_tx->my_nextdesc; /* Pack the data into the descriptor. */ my_encap(sc, cur_tx, m_head); if (cur_tx != start_tx) MY_TXOWN(cur_tx) = MY_OWNByNIC; #if NBPFILTER > 0 /* * If there's a BPF listener, bounce a copy of this frame to * him. */ BPF_MTAP(ifp, cur_tx->my_mbuf); #endif } /* * If there are no packets queued, bail. */ if (cur_tx == NULL) { return; } /* * Place the request for the upload interrupt in the last descriptor * in the chain. This way, if we're chaining several packets at once, * we'll only get an interrupt once for the whole chain rather than * once for each packet. */ MY_TXCTL(cur_tx) |= MY_TXIC; cur_tx->my_ptr->my_frag[0].my_ctl |= MY_TXIC; sc->my_cdata.my_tx_tail = cur_tx; if (sc->my_cdata.my_tx_head == NULL) sc->my_cdata.my_tx_head = start_tx; MY_TXOWN(start_tx) = MY_OWNByNIC; CSR_WRITE_4(sc, MY_TXPDR, 0xFFFFFFFF); /* tx polling demand */ /* * Set a timeout in case the chip goes out to lunch. */ sc->my_timer = 5; return; } static void my_init(void *xsc) { struct my_softc *sc = xsc; MY_LOCK(sc); my_init_locked(sc); MY_UNLOCK(sc); } static void my_init_locked(struct my_softc *sc) { if_t ifp = sc->my_ifp; u_int16_t phy_bmcr = 0; MY_LOCK_ASSERT(sc); if (sc->my_autoneg) { return; } if (sc->my_pinfo != NULL) phy_bmcr = my_phy_readreg(sc, PHY_BMCR); /* * Cancel pending I/O and free all RX/TX buffers. */ my_stop(sc); my_reset(sc); /* * Set cache alignment and burst length. */ #if 0 /* 89/9/1 modify, */ CSR_WRITE_4(sc, MY_BCR, MY_RPBLE512); CSR_WRITE_4(sc, MY_TCRRCR, MY_TFTSF); #endif CSR_WRITE_4(sc, MY_BCR, MY_PBL8); CSR_WRITE_4(sc, MY_TCRRCR, MY_TFTSF | MY_RBLEN | MY_RPBLE512); /* * 89/12/29 add, for mtd891, */ if (sc->my_info->my_did == MTD891ID) { MY_SETBIT(sc, MY_BCR, MY_PROG); MY_SETBIT(sc, MY_TCRRCR, MY_Enhanced); } my_setcfg(sc, phy_bmcr); /* Init circular RX list. 
*/ if (my_list_rx_init(sc) == ENOBUFS) { device_printf(sc->my_dev, "init failed: no memory for rx buffers\n"); my_stop(sc); return; } /* Init TX descriptors. */ my_list_tx_init(sc); /* If we want promiscuous mode, set the allframes bit. */ if (if_getflags(ifp) & IFF_PROMISC) MY_SETBIT(sc, MY_TCRRCR, MY_PROM); else MY_CLRBIT(sc, MY_TCRRCR, MY_PROM); /* * Set capture broadcast bit to capture broadcast frames. */ if (if_getflags(ifp) & IFF_BROADCAST) MY_SETBIT(sc, MY_TCRRCR, MY_AB); else MY_CLRBIT(sc, MY_TCRRCR, MY_AB); /* * Program the multicast filter, if necessary. */ my_setmulti(sc); /* * Load the address of the RX list. */ MY_CLRBIT(sc, MY_TCRRCR, MY_RE); CSR_WRITE_4(sc, MY_RXLBA, vtophys(&sc->my_ldata->my_rx_list[0])); /* * Enable interrupts. */ CSR_WRITE_4(sc, MY_IMR, MY_INTRS); CSR_WRITE_4(sc, MY_ISR, 0xFFFFFFFF); /* Enable receiver and transmitter. */ MY_SETBIT(sc, MY_TCRRCR, MY_RE); MY_CLRBIT(sc, MY_TCRRCR, MY_TE); CSR_WRITE_4(sc, MY_TXLBA, vtophys(&sc->my_ldata->my_tx_list[0])); MY_SETBIT(sc, MY_TCRRCR, MY_TE); /* Restore state of BMCR */ if (sc->my_pinfo != NULL) my_phy_writereg(sc, PHY_BMCR, phy_bmcr); if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); callout_reset(&sc->my_watchdog, hz, my_watchdog, sc); return; } /* * Set media options. */ static int my_ifmedia_upd(if_t ifp) { struct my_softc *sc; struct ifmedia *ifm; sc = if_getsoftc(ifp); MY_LOCK(sc); ifm = &sc->ifmedia; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) { MY_UNLOCK(sc); return (EINVAL); } if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) my_autoneg_mii(sc, MY_FLAG_SCHEDDELAY, 1); else my_setmode_mii(sc, ifm->ifm_media); MY_UNLOCK(sc); return (0); } /* * Report current media status. 
*/ static void my_ifmedia_sts(if_t ifp, struct ifmediareq * ifmr) { struct my_softc *sc; u_int16_t advert = 0, ability = 0; sc = if_getsoftc(ifp); MY_LOCK(sc); ifmr->ifm_active = IFM_ETHER; if (!(my_phy_readreg(sc, PHY_BMCR) & PHY_BMCR_AUTONEGENBL)) { #if 0 /* this version did not support 1000M, */ if (my_phy_readreg(sc, PHY_BMCR) & PHY_BMCR_1000) ifmr->ifm_active = IFM_ETHER | IFM_1000TX; #endif if (my_phy_readreg(sc, PHY_BMCR) & PHY_BMCR_SPEEDSEL) ifmr->ifm_active = IFM_ETHER | IFM_100_TX; else ifmr->ifm_active = IFM_ETHER | IFM_10_T; if (my_phy_readreg(sc, PHY_BMCR) & PHY_BMCR_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; MY_UNLOCK(sc); return; } ability = my_phy_readreg(sc, PHY_LPAR); advert = my_phy_readreg(sc, PHY_ANAR); #if 0 /* this version did not support 1000M, */ if (sc->my_pinfo->my_vid = MarvellPHYID0) { ability2 = my_phy_readreg(sc, PHY_1000SR); if (ability2 & PHY_1000SR_1000BTXFULL) { advert = 0; ability = 0; ifmr->ifm_active = IFM_ETHER|IFM_1000_T|IFM_FDX; } else if (ability & PHY_1000SR_1000BTXHALF) { advert = 0; ability = 0; ifmr->ifm_active = IFM_ETHER|IFM_1000_T|IFM_HDX; } } #endif if (advert & PHY_ANAR_100BT4 && ability & PHY_ANAR_100BT4) ifmr->ifm_active = IFM_ETHER | IFM_100_T4; else if (advert & PHY_ANAR_100BTXFULL && ability & PHY_ANAR_100BTXFULL) ifmr->ifm_active = IFM_ETHER | IFM_100_TX | IFM_FDX; else if (advert & PHY_ANAR_100BTXHALF && ability & PHY_ANAR_100BTXHALF) ifmr->ifm_active = IFM_ETHER | IFM_100_TX | IFM_HDX; else if (advert & PHY_ANAR_10BTFULL && ability & PHY_ANAR_10BTFULL) ifmr->ifm_active = IFM_ETHER | IFM_10_T | IFM_FDX; else if (advert & PHY_ANAR_10BTHALF && ability & PHY_ANAR_10BTHALF) ifmr->ifm_active = IFM_ETHER | IFM_10_T | IFM_HDX; MY_UNLOCK(sc); return; } static int my_ioctl(if_t ifp, u_long command, caddr_t data) { struct my_softc *sc = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *) data; int error; switch (command) { case SIOCSIFFLAGS: MY_LOCK(sc); if (if_getflags(ifp) & IFF_UP) 
my_init_locked(sc); else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) my_stop(sc); MY_UNLOCK(sc); error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: MY_LOCK(sc); my_setmulti(sc); MY_UNLOCK(sc); error = 0; break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->ifmedia, command); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static void my_watchdog(void *arg) { struct my_softc *sc; if_t ifp; sc = arg; MY_LOCK_ASSERT(sc); callout_reset(&sc->my_watchdog, hz, my_watchdog, sc); if (sc->my_timer == 0 || --sc->my_timer > 0) return; ifp = sc->my_ifp; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if_printf(ifp, "watchdog timeout\n"); if (!(my_phy_readreg(sc, PHY_BMSR) & PHY_BMSR_LINKSTAT)) if_printf(ifp, "no carrier - transceiver cable problem?\n"); my_stop(sc); my_reset(sc); my_init_locked(sc); if (!if_sendq_empty(ifp)) my_start_locked(ifp); } /* * Stop the adapter and free any mbufs allocated to the RX and TX lists. */ static void my_stop(struct my_softc * sc) { int i; if_t ifp; MY_LOCK_ASSERT(sc); ifp = sc->my_ifp; callout_stop(&sc->my_autoneg_timer); callout_stop(&sc->my_watchdog); MY_CLRBIT(sc, MY_TCRRCR, (MY_RE | MY_TE)); CSR_WRITE_4(sc, MY_IMR, 0x00000000); CSR_WRITE_4(sc, MY_TXLBA, 0x00000000); CSR_WRITE_4(sc, MY_RXLBA, 0x00000000); /* * Free data in the RX lists. */ for (i = 0; i < MY_RX_LIST_CNT; i++) { if (sc->my_cdata.my_rx_chain[i].my_mbuf != NULL) { m_freem(sc->my_cdata.my_rx_chain[i].my_mbuf); sc->my_cdata.my_rx_chain[i].my_mbuf = NULL; } } bzero((char *)&sc->my_ldata->my_rx_list, sizeof(sc->my_ldata->my_rx_list)); /* * Free the TX list buffers. 
*/ for (i = 0; i < MY_TX_LIST_CNT; i++) { if (sc->my_cdata.my_tx_chain[i].my_mbuf != NULL) { m_freem(sc->my_cdata.my_tx_chain[i].my_mbuf); sc->my_cdata.my_tx_chain[i].my_mbuf = NULL; } } bzero((char *)&sc->my_ldata->my_tx_list, sizeof(sc->my_ldata->my_tx_list)); if_setdrvflagbits(ifp, 0, (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); return; } /* * Stop all chip I/O so that the kernel's probe routines don't get confused * by errant DMAs when rebooting. */ static int my_shutdown(device_t dev) { struct my_softc *sc; sc = device_get_softc(dev); MY_LOCK(sc); my_stop(sc); MY_UNLOCK(sc); return 0; } diff --git a/sys/dev/usb/usb_pf.c b/sys/dev/usb/usb_pf.c index 43e819684857..4da59419a7c6 100644 --- a/sys/dev/usb/usb_pf.c +++ b/sys/dev/usb/usb_pf.c @@ -1,539 +1,537 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ static void usbpf_init(void *); static void usbpf_uninit(void *); static int usbpf_ioctl(if_t, u_long, caddr_t); static int usbpf_clone_match(struct if_clone *, const char *); static int usbpf_clone_create(struct if_clone *, char *, size_t, struct ifc_data *, if_t *); static int usbpf_clone_destroy(struct if_clone *, if_t, uint32_t); static struct usb_bus *usbpf_ifname2ubus(const char *); static uint32_t usbpf_aggregate_xferflags(struct usb_xfer_flags *); static uint32_t usbpf_aggregate_status(struct usb_xfer_flags_int *); static int usbpf_xfer_frame_is_read(struct usb_xfer *, uint32_t); static uint32_t usbpf_xfer_precompute_size(struct usb_xfer *, int); static struct if_clone *usbpf_cloner; static const char usbusname[] = "usbus"; SYSINIT(usbpf_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, usbpf_init, NULL); SYSUNINIT(usbpf_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, usbpf_uninit, NULL); static void 
usbpf_init(void *arg) { struct if_clone_addreq req = { .match_f = usbpf_clone_match, .create_f = usbpf_clone_create, .destroy_f = usbpf_clone_destroy, }; usbpf_cloner = ifc_attach_cloner(usbusname, &req); } static void usbpf_uninit(void *arg) { int devlcnt; device_t *devlp; devclass_t dc; struct usb_bus *ubus; int error; int i; if_clone_detach(usbpf_cloner); dc = devclass_find(usbusname); if (dc == NULL) return; error = devclass_get_devices(dc, &devlp, &devlcnt); if (error) return; for (i = 0; i < devlcnt; i++) { ubus = device_get_softc(devlp[i]); if (ubus != NULL && ubus->ifp != NULL) usbpf_clone_destroy(usbpf_cloner, ubus->ifp, 0); } free(devlp, M_TEMP); } static int usbpf_ioctl(if_t ifp, u_long cmd, caddr_t data) { /* No configuration allowed. */ return (EINVAL); } static struct usb_bus * usbpf_ifname2ubus(const char *ifname) { device_t dev; devclass_t dc; int unit; int error; if (strncmp(ifname, usbusname, sizeof(usbusname) - 1) != 0) return (NULL); error = ifc_name2unit(ifname, &unit); if (error || unit < 0) return (NULL); dc = devclass_find(usbusname); if (dc == NULL) return (NULL); dev = devclass_get_device(dc, unit); if (dev == NULL) return (NULL); return (device_get_softc(dev)); } static int usbpf_clone_match(struct if_clone *ifc, const char *name) { struct usb_bus *ubus; ubus = usbpf_ifname2ubus(name); if (ubus == NULL) return (0); if (ubus->ifp != NULL) return (0); return (1); } static int usbpf_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, if_t *ifpp) { int error; int unit; if_t ifp; struct usb_bus *ubus; error = ifc_name2unit(name, &unit); if (error) return (error); if (unit < 0) return (EINVAL); ubus = usbpf_ifname2ubus(name); if (ubus == NULL) return (1); if (ubus->ifp != NULL) return (1); error = ifc_alloc_unit(ifc, &unit); if (error) { device_printf(ubus->parent, "usbpf: Could not allocate " "instance\n"); return (error); } ifp = ubus->ifp = if_alloc(IFT_USB); if (ifp == NULL) { ifc_free_unit(ifc, unit); 
device_printf(ubus->parent, "usbpf: Could not allocate " "instance\n"); return (ENOSPC); } if_setsoftc(ifp, ubus); if_initname(ifp, usbusname, unit); if_setname(ifp, name); if_setioctlfn(ifp, usbpf_ioctl); if_attach(ifp); if_setflagbits(ifp, IFF_UP, 0); rt_ifmsg(ifp, IFF_UP); /* * XXX According to the specification of DLT_USB, it indicates * packets beginning with USB setup header. But not sure all * packets would be. */ bpfattach(ifp, DLT_USB, USBPF_HDR_LEN); *ifpp = ifp; return (0); } static int usbpf_clone_destroy(struct if_clone *ifc, if_t ifp, uint32_t flags) { struct usb_bus *ubus; int unit; ubus = if_getsoftc(ifp); unit = if_getdunit(ifp); /* * Lock USB before clearing the "ifp" pointer, to avoid * clearing the pointer in the middle of a TAP operation: */ USB_BUS_LOCK(ubus); ubus->ifp = NULL; USB_BUS_UNLOCK(ubus); bpfdetach(ifp); if_detach(ifp); if_free(ifp); ifc_free_unit(ifc, unit); return (0); } void usbpf_attach(struct usb_bus *ubus) { if (bootverbose) device_printf(ubus->parent, "usbpf: Attached\n"); } void usbpf_detach(struct usb_bus *ubus) { if (ubus->ifp != NULL) usbpf_clone_destroy(usbpf_cloner, ubus->ifp, 0); if (bootverbose) device_printf(ubus->parent, "usbpf: Detached\n"); } static uint32_t usbpf_aggregate_xferflags(struct usb_xfer_flags *flags) { uint32_t val = 0; if (flags->force_short_xfer == 1) val |= USBPF_FLAG_FORCE_SHORT_XFER; if (flags->short_xfer_ok == 1) val |= USBPF_FLAG_SHORT_XFER_OK; if (flags->short_frames_ok == 1) val |= USBPF_FLAG_SHORT_FRAMES_OK; if (flags->pipe_bof == 1) val |= USBPF_FLAG_PIPE_BOF; if (flags->proxy_buffer == 1) val |= USBPF_FLAG_PROXY_BUFFER; if (flags->ext_buffer == 1) val |= USBPF_FLAG_EXT_BUFFER; if (flags->manual_status == 1) val |= USBPF_FLAG_MANUAL_STATUS; if (flags->no_pipe_ok == 1) val |= USBPF_FLAG_NO_PIPE_OK; if (flags->stall_pipe == 1) val |= USBPF_FLAG_STALL_PIPE; return (val); } static uint32_t usbpf_aggregate_status(struct usb_xfer_flags_int *flags) { uint32_t val = 0; if (flags->open == 1) val |= 
USBPF_STATUS_OPEN; if (flags->transferring == 1) val |= USBPF_STATUS_TRANSFERRING; if (flags->did_dma_delay == 1) val |= USBPF_STATUS_DID_DMA_DELAY; if (flags->did_close == 1) val |= USBPF_STATUS_DID_CLOSE; if (flags->draining == 1) val |= USBPF_STATUS_DRAINING; if (flags->started == 1) val |= USBPF_STATUS_STARTED; if (flags->bandwidth_reclaimed == 1) val |= USBPF_STATUS_BW_RECLAIMED; if (flags->control_xfr == 1) val |= USBPF_STATUS_CONTROL_XFR; if (flags->control_hdr == 1) val |= USBPF_STATUS_CONTROL_HDR; if (flags->control_act == 1) val |= USBPF_STATUS_CONTROL_ACT; if (flags->control_stall == 1) val |= USBPF_STATUS_CONTROL_STALL; if (flags->short_frames_ok == 1) val |= USBPF_STATUS_SHORT_FRAMES_OK; if (flags->short_xfer_ok == 1) val |= USBPF_STATUS_SHORT_XFER_OK; #if USB_HAVE_BUSDMA if (flags->bdma_enable == 1) val |= USBPF_STATUS_BDMA_ENABLE; if (flags->bdma_no_post_sync == 1) val |= USBPF_STATUS_BDMA_NO_POST_SYNC; if (flags->bdma_setup == 1) val |= USBPF_STATUS_BDMA_SETUP; #endif if (flags->isochronous_xfr == 1) val |= USBPF_STATUS_ISOCHRONOUS_XFR; if (flags->curr_dma_set == 1) val |= USBPF_STATUS_CURR_DMA_SET; if (flags->can_cancel_immed == 1) val |= USBPF_STATUS_CAN_CANCEL_IMMED; if (flags->doing_callback == 1) val |= USBPF_STATUS_DOING_CALLBACK; return (val); } static int usbpf_xfer_frame_is_read(struct usb_xfer *xfer, uint32_t frame) { int isread; if ((frame == 0) && (xfer->flags_int.control_xfr != 0) && (xfer->flags_int.control_hdr != 0)) { /* special case */ if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) { /* The device controller writes to memory */ isread = 1; } else { /* The host controller reads from memory */ isread = 0; } } else { isread = USB_GET_DATA_ISREAD(xfer); } return (isread); } static uint32_t usbpf_xfer_precompute_size(struct usb_xfer *xfer, int type) { uint32_t totlen; uint32_t x; uint32_t nframes; if (type == USBPF_XFERTAP_SUBMIT) nframes = xfer->nframes; else nframes = xfer->aframes; totlen = USBPF_HDR_LEN + (USBPF_FRAME_HDR_LEN * 
nframes); /* precompute all trace lengths */ for (x = 0; x != nframes; x++) { if (usbpf_xfer_frame_is_read(xfer, x)) { if (type != USBPF_XFERTAP_SUBMIT) { totlen += USBPF_FRAME_ALIGN( xfer->frlengths[x]); } } else { if (type == USBPF_XFERTAP_SUBMIT) { totlen += USBPF_FRAME_ALIGN( xfer->frlengths[x]); } } } return (totlen); } void usbpf_xfertap(struct usb_xfer *xfer, int type) { struct usb_bus *bus; struct usbpf_pkthdr *up; struct usbpf_framehdr *uf; usb_frlength_t offset; uint32_t totlen; uint32_t frame; uint32_t temp; uint32_t nframes; uint32_t x; uint8_t *buf; uint8_t *ptr; bus = xfer->xroot->bus; /* sanity checks */ - if (bus->ifp == NULL || if_getbpf(bus->ifp) == NULL) - return; - if (!bpf_peers_present(if_getbpf(bus->ifp))) + if (bus->ifp == NULL || !bpf_peers_present_if(bus->ifp)) return; totlen = usbpf_xfer_precompute_size(xfer, type); if (type == USBPF_XFERTAP_SUBMIT) nframes = xfer->nframes; else nframes = xfer->aframes; /* * XXX TODO XXX * * When BPF supports it we could pass a fragmented array of * buffers avoiding the data copy operation here. 
*/ /* M_NOWAIT allocation: on failure the tap is dropped with a message */ buf = ptr = malloc(totlen, M_TEMP, M_NOWAIT); if (buf == NULL) { device_printf(bus->parent, "usbpf: Out of memory\n"); return; } up = (struct usbpf_pkthdr *)ptr; ptr += USBPF_HDR_LEN; /* fill out header; 32-bit fields are stored through htole32() */ temp = device_get_unit(bus->bdev); up->up_totlen = htole32(totlen); up->up_busunit = htole32(temp); up->up_address = xfer->xroot->udev->device_index; if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) up->up_mode = USBPF_MODE_DEVICE; else up->up_mode = USBPF_MODE_HOST; up->up_type = type; up->up_xfertype = xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE; temp = usbpf_aggregate_xferflags(&xfer->flags); up->up_flags = htole32(temp); temp = usbpf_aggregate_status(&xfer->flags_int); up->up_status = htole32(temp); temp = xfer->error; up->up_error = htole32(temp); temp = xfer->interval; up->up_interval = htole32(temp); up->up_frames = htole32(nframes); temp = xfer->max_packet_size; up->up_packet_size = htole32(temp); temp = xfer->max_packet_count; up->up_packet_count = htole32(temp); temp = xfer->endpointno; up->up_endpoint = htole32(temp); up->up_speed = xfer->xroot->udev->speed; /* clear reserved area */ memset(up->up_reserved, 0, sizeof(up->up_reserved)); /* init offset and frame */ offset = 0; frame = 0; /* iterate all the USB frames and copy data, if any */ for (x = 0; x != nframes; x++) { uint32_t length; int isread; /* get length */ length = xfer->frlengths[x]; /* get frame header pointer */ uf = (struct usbpf_framehdr *)ptr; ptr += USBPF_FRAME_HDR_LEN; /* fill out frame header */ uf->length = htole32(length); uf->flags = 0; /* get information about data read/write */ isread = usbpf_xfer_frame_is_read(xfer, x); /* check if we need to copy any data; length 0 means no payload follows */ if (isread) { if (type == USBPF_XFERTAP_SUBMIT) length = 0; else { uf->flags |= htole32( USBPF_FRAMEFLAG_DATA_FOLLOWS); } } else { if (type != USBPF_XFERTAP_SUBMIT) length = 0; else { uf->flags |= htole32( USBPF_FRAMEFLAG_DATA_FOLLOWS); } } /* check if data is read direction */ if (isread) uf->flags |= 
htole32(USBPF_FRAMEFLAG_READ); /* copy USB data, if any */ if (length != 0) { /* copy data */ usbd_copy_out(&xfer->frbuffers[frame], offset, ptr, length); /* align length */ temp = USBPF_FRAME_ALIGN(length); /* zero pad */ if (temp != length) memset(ptr + length, 0, temp - length); ptr += temp; } /* isochronous transfers stay on the same frbuffers[] entry and advance the offset; all others step to the next entry */ if (xfer->flags_int.isochronous_xfr) { offset += usbd_xfer_old_frame_length(xfer, x); } else { frame ++; } } /* hand the assembled buffer to BPF, then release it */ bpf_tap_if(bus->ifp, buf, totlen); free(buf, M_TEMP); } diff --git a/sys/net/bpf.c b/sys/net/bpf.c index 8ca6e941e646..96420b709911 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -1,3209 +1,3223 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2019 Andrey V. Elsukov * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include #include "opt_bpf.h" #include "opt_ddb.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static struct bpf_if_ext dead_bpf_if = { .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { #define bif_next bif_ext.bif_next #define bif_dlist bif_ext.bif_dlist struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ volatile u_int bif_refcnt; /* references, see bpfif_ref()/bpfif_rele() */ struct epoch_context epoch_ctx; /* used to defer bpfif_free() */ }; /* bif_ext must stay the first member */ CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); /* Filter program storage; released through bpf_program_buffer_free() */ struct bpf_program_buffer { struct epoch_context epoch_ctx; #ifdef BPF_JITTER bpf_jit_filter *func; #endif void *buffer[0]; }; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define BPF_PRIO_MAX 7 #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type 
*)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark a stream as * 32-bit the first time we see a 32-bit compat ioctl request. */ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif #define BPF_LOCK() sx_xlock(&bpf_sx) #define BPF_UNLOCK() sx_xunlock(&bpf_sx) #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (e.g., 802.11 * frames, ethernet frames, etc). 
*/ CK_LIST_HEAD(bpf_iflist, bpf_if); static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; /* attached descriptors; updated in bpf_attachd()/bpf_detachd_locked() */ static void bpfif_ref(struct bpf_if *); static void bpfif_rele(struct bpf_if *); static void bpfd_ref(struct bpf_d *); static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *, bool); static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static int filt_bpfwrite(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); /* sysctl knobs registered under the net.bpf node */ SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, 
CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; static struct filterops bpfwrite_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfwrite, }; /* * LOCKING MODEL USED BY BPF * * Locks: * 1) global lock (BPF_LOCK). Sx, used to protect some global counters and * every bpf_iflist change, and to serialize ioctl access to bpf descriptors. * 2) Descriptor lock. Mutex, used to protect BPF buffers and various * structure fields used by bpf_*tap* code. * * Lock order: global lock, then descriptor lock. * * There are several possible consumers: * * 1. The kernel registers an interface pointer with bpfattach(). * Each call allocates a new bpf_if structure, references the ifnet pointer * and links the bpf_if into the bpf_iflist chain. This is protected by the * global lock. * * 2. A userland application uses ioctl() calls on a bpf_d descriptor. * All such calls are serialized with the global lock. BPF filters can be * changed, but the pointer to the old filter is freed using NET_EPOCH_CALL(). * Thus it should be safe for bpf_tap/bpf_mtap* code to access * filter pointers, even if a change happens during bpf_tap execution. * Destruction of a bpf_d descriptor is also done using NET_EPOCH_CALL(). * * 3. A userland application can write packets into a bpf_d descriptor. * There we need to be sure that the ifnet won't disappear during bpfwrite(). * * 4. The kernel invokes bpf_tap/bpf_mtap* functions. 
The access to * bif_dlist is protected by a net_epoch_preempt section. So, it should * be safe to access the bpf_d descriptor inside the section. * * 5. The kernel invokes bpfdetach() when an interface is destroyed. All lists * are modified with the global lock held and the actual free() is done using * NET_EPOCH_CALL(). */ /* Epoch callback: drop the ifnet reference and free the bpf_if. */ static void bpfif_free(epoch_context_t ctx) { struct bpf_if *bp; bp = __containerof(ctx, struct bpf_if, epoch_ctx); if_rele(bp->bif_ifp); free(bp, M_BPF); } static void bpfif_ref(struct bpf_if *bp) { refcount_acquire(&bp->bif_refcnt); } /* Drop a reference; the last release schedules bpfif_free(). */ static void bpfif_rele(struct bpf_if *bp) { if (!refcount_release(&bp->bif_refcnt)) return; NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx); } static void bpfd_ref(struct bpf_d *d) { refcount_acquire(&d->bd_refcnt); } /* Drop a reference; the last release schedules bpfd_free(). */ static void bpfd_rele(struct bpf_d *d) { if (!refcount_release(&d->bd_refcnt)) return; NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx); } /* Allocate a program buffer header followed by 'size' bytes of storage. */ static struct bpf_program_buffer* bpf_program_buffer_alloc(size_t size, int flags) { return (malloc(sizeof(struct bpf_program_buffer) + size, M_BPF, flags)); } /* Epoch callback: destroy the JIT filter, if any, and free the buffer. */ static void bpf_program_buffer_free(epoch_context_t ctx) { struct bpf_program_buffer *ptr; ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); #ifdef BPF_JITTER if (ptr->func != NULL) bpf_destroy_jit_filter(ptr->func); #endif free(ptr, M_BPF); } /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, etc. 
*/ /* Append 'len' bytes from 'src' at 'offset' in 'buf' using the descriptor's buffer mode. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: /* count the zero-copy append */ counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } /* Same dispatch as bpf_append_bytes(), through the *_append_mbuf() routines. */ static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: /* count the zero-copy append */ counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. * Only the zero-copy mode provides such a check; other modes return (0). */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. 
*/ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } /* Release the descriptor's buffers through the routine for its buffer mode. */ static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } /* The wrappers below return EOPNOTSUPP when the descriptor is not in the buffer mode the operation requires. */ static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ /* * Copy a write from 'uio' into a newly allocated mbuf returned through 'mp'. * The data is run through the descriptor's write filter, the link header is * copied into sockp->sa_data and its length is returned through 'hdrlen'. * Returns 0 on success or EIO, EMSGSIZE, EPERM, EINVAL or the uiomove() error. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. 
* In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; /* the write must hold the link header and its payload must not exceed the MTU */ if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); /* Allocate one mbuf with m_get3() that holds the whole write (len bytes). */ m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; /* a write filter result of 0 rejects the packet */ slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, 
eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } /* fill in our own source address unless the caller marked the header complete */ if (d->bd_hdrcmplt == 0) { memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach descriptor to the bpf interface, i.e. make d listen on bp, * then reset its buffers and counters with reset_d(). */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFD_LOCK(d); /* * Hold reference to bpif while descriptor uses this interface. */ bpfif_ref(bp); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up * snap length. After that application usually sets its own * filter. */ d->bd_writer = 2; } else CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); reset_d(d); /* Trigger EVFILT_WRITE events. 
*/ bpf_wakeup(d); BPFD_UNLOCK(d); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); /* only descriptors placed on the active list are reported to bpf_track */ if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. * Returns 1 when an upgrade is needed and 0 otherwise. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or new filter is empty. */ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). * Note we're not checking .k value here: * while pcap_open_live() definitely sets to non-zero value, * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting first filter and it doesn't look like * setting snaplen. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require upgrade by first BIOCSETF * (used to set snaplen) by pcap_open_live(). */ if (--d->bd_writer == 0) { /* * First snaplen filter has already * been set. This is probably catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Detach a file from its interface. 
*/ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d, false); BPF_UNLOCK(); } /* * Detach d from its interface with the global lock held. 'detached_ifp' * is true when the interface itself is going away: sleepers are then woken * up and promiscuous mode is not touched. */ static void bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { struct bpf_if *bp; struct ifnet *ifp; int error; BPF_LOCK_ASSERT(); CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFD_LOCK(d); /* Remove d from the interface's descriptor list. */ CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value ('error' is reused as scratch storage here) */ error = d->bd_writer; ifp = bp->bif_ifp; d->bd_bif = NULL; if (detached_ifp) { /* * Notify descriptor as it's detached, so that any * sleepers wake up and get ENXIO. */ bpf_wakeup(d); } BPFD_UNLOCK(d); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. * If so and ifnet is not detached, turn it off. */ if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } /* drop the interface reference taken in bpf_attachd() */ bpfif_rele(bp); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpfd_rele(d); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. * NOTE(review): the bpfopen() body returns only the devfs_set_cdevpriv() * error or 0 -- confirm that the ENXIO/EBUSY description is still current. 
*/ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); /* register bpf_dtor() as the destructor for this open's private data */ error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* Setup counters */ d->bd_rcount = counter_u64_alloc(M_WAITOK); d->bd_dcount = counter_u64_alloc(M_WAITOK); d->bd_fcount = counter_u64_alloc(M_WAITOK); d->bd_wcount = counter_u64_alloc(M_WAITOK); d->bd_wfcount = counter_u64_alloc(M_WAITOK); d->bd_wdcount = counter_u64_alloc(M_WAITOK); d->bd_zcopy = counter_u64_alloc(M_WAITOK); /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); /* descriptors opened without FREAD start out as writers */ if ((flags & FREAD) == 0) d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; refcount_init(&d->bd_refcnt, 1); BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); /* Disable VLAN pcp tagging. */ d->bd_pcp = 0; return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size * as kernel buffers. 
*/ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; /* wait until no other thread is using the hold buffer */ while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET|PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. */ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. 
*/ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); /* the hold buffer becomes the free buffer again */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. * Also stops a pending read timeout and notifies SIGIO, select and kqueue * consumers. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } /* Callout handler: mark a waiting descriptor as timed out and wake it up if the store buffer holds data. */ static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); /* ignore a callout that was rescheduled or stopped */ if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } /* * Return 1 if the descriptor has data to report: a non-empty hold buffer * that cannot be freed, or a non-empty store buffer while in immediate mode * or after a timeout. Return 0 otherwise. */ static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } /* Write a packet from userland to the interface the descriptor is attached to. */ static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct route ro; struct sockaddr dst; struct epoch_tracker et; struct bpf_if *bp; struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); NET_EPOCH_ENTER(et); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); if ((bp = d->bd_bif) == NULL) { error = ENXIO; goto out_locked; } ifp = bp->bif_ifp; if 
((ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto out_locked; } if (uio->uio_resid == 0) goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* * Take extra reference, unlock d and exit from epoch section, * since bpf_movein() can sleep. */ bpfd_ref(d); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); if (error != 0) { counter_u64_add(d->bd_wdcount, 1); bpfd_rele(d); return (error); } BPFD_LOCK(d); /* * Check that descriptor is still attached to the interface. * This can happen on bpfdetach(). To avoid access to detached * ifnet, free mbuf and return ENXIO. */ if (d->bd_bif == NULL) { counter_u64_add(d->bd_wdcount, 1); BPFD_UNLOCK(d); bpfd_rele(d); m_freem(m); return (ENXIO); } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; /* in feedback mode a copy of the packet is also passed to if_input() */ if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; /* strip the link header that bpf_movein() copied into dst */ m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); #endif bzero(&ro, sizeof(ro)); if (hlen != 0) { ro.ro_prepend = (u_char *)&dst.sa_data; ro.ro_plen = hlen; ro.ro_flags = RT_HAS_HEADER; } if (d->bd_pcp != 0) vlan_set_pcp(m, d->bd_pcp); /* Avoid possible recursion on BPFD_LOCK(). */ NET_EPOCH_ENTER(et); BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); bpfd_rele(d); return (error); /* exit path taken with the descriptor locked and inside the epoch section; increments bd_wdcount */ out_locked: counter_u64_add(d->bd_wdcount, 1); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. 
This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); /* wait until no reader is using the hold buffer */ while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* discard the store buffer contents only when the buffer model allows writing */ if (bpf_canwritebuf(d)) d->bd_slen = 0; counter_u64_zero(d->bd_rcount); counter_u64_zero(d->bd_dcount); counter_u64_zero(d->bd_fcount); counter_u64_zero(d->bd_wcount); counter_u64_zero(d->bd_wfcount); counter_u64_zero(d->bd_wdcount); counter_u64_zero(d->bd_zcopy); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. 
* BIOCGETBUFMODE Get current buffer mode. * BIOCSETVLANPCP Set VLAN PCP tag. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. 
*/ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: BPF_LOCK(); if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; } else if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } BPF_UNLOCK(); break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. */ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. 
*/ case BIOCSETIF: { int alloc_buf, size; /* * Behavior here depends on the buffering model. If * we're using kernel memory buffers, then we can * allocate them here. If we're using zero-copy, * then the user process must have registered buffers * by the time we get here. */ alloc_buf = 0; BPFD_LOCK(d); if (d->bd_bufmode == BPF_BUFMODE_BUFFER && d->bd_sbuf == NULL) alloc_buf = 1; BPFD_UNLOCK(d); if (alloc_buf) { size = d->bd_bufsize; error = bpf_buffer_ioctl_sblen(d, &size); if (error != 0) break; } BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; } /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount); bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount); break; } /* * Set immediate mode. 
*/ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. */ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. 
*/ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCSETVLANPCP: { u_int pcp; pcp = *(u_int *)addr; if (pcp > BPF_PRIO_MAX || pcp < 0) { error = EINVAL; break; } d->bd_pcp = pcp; break; } } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we use global lock here to serialize bpf_setf() and bpf_setif() * calls. 
*/
static int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
{
#ifdef COMPAT_FREEBSD32
	struct bpf_program fp_swab;
	struct bpf_program32 *fp32;
#endif
	struct bpf_program_buffer *fcode;
	struct bpf_insn *filter;
#ifdef BPF_JITTER
	bpf_jit_filter *jfunc;
#endif
	size_t size;
	u_int flen;
	bool track_event;

#ifdef COMPAT_FREEBSD32
	/*
	 * Convert a 32-bit program descriptor into the native layout and
	 * translate the command to its native equivalent.
	 * NOTE(review): BIOCSETFNR32 is not translated to BIOCSETFNR by the
	 * inner switch; the code below only compares cmd against BIOCSETWF
	 * and BIOCSETF, so this looks harmless here -- confirm against
	 * bpf_check_upgrade().
	 */
	switch (cmd) {
	case BIOCSETF32:
	case BIOCSETWF32:
	case BIOCSETFNR32:
		fp32 = (struct bpf_program32 *)fp;
		fp_swab.bf_len = fp32->bf_len;
		fp_swab.bf_insns =
		    (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
		fp = &fp_swab;
		switch (cmd) {
		case BIOCSETF32:
			cmd = BIOCSETF;
			break;
		case BIOCSETWF32:
			cmd = BIOCSETWF;
			break;
		}
		break;
	}
#endif

	filter = NULL;
#ifdef BPF_JITTER
	jfunc = NULL;
#endif
	/*
	 * Check new filter validness before acquiring any locks.
	 * Allocate memory for new filter, if needed.
	 */
	flen = fp->bf_len;
	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
		return (EINVAL);
	size = flen * sizeof(*fp->bf_insns);
	if (size > 0) {
		/* We're setting up new filter.  Copy and check actual data. */
		fcode = bpf_program_buffer_alloc(size, M_WAITOK);
		filter = (struct bpf_insn *)fcode->buffer;
		if (copyin(fp->bf_insns, filter, size) != 0 ||
		    !bpf_validate(filter, flen)) {
			free(fcode, M_BPF);
			return (EINVAL);
		}
#ifdef BPF_JITTER
		if (cmd != BIOCSETWF) {
			/*
			 * Filter is copied inside fcode and is
			 * perfectly valid.
			 */
			jfunc = bpf_jitter(filter, flen);
		}
#endif
	}

	track_event = false;
	/* From here on fcode refers to the OLD program to be released. */
	fcode = NULL;

	BPF_LOCK();
	BPFD_LOCK(d);
	/* Set up new filter. */
	if (cmd == BIOCSETWF) {
		if (d->bd_wfilter != NULL) {
			fcode = __containerof((void *)d->bd_wfilter,
			    struct bpf_program_buffer, buffer);
#ifdef BPF_JITTER
			fcode->func = NULL;
#endif
		}
		d->bd_wfilter = filter;
	} else {
		if (d->bd_rfilter != NULL) {
			fcode = __containerof((void *)d->bd_rfilter,
			    struct bpf_program_buffer, buffer);
#ifdef BPF_JITTER
			fcode->func = d->bd_bfilter;
#endif
		}
		d->bd_rfilter = filter;
#ifdef BPF_JITTER
		d->bd_bfilter = jfunc;
#endif
		/* Only BIOCSETF flushes the buffers and counters. */
		if (cmd == BIOCSETF)
			reset_d(d);

		if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
			/*
			 * Filter can be set several times without
			 * specifying interface. In this case just mark d
			 * as reader.
			 */
			d->bd_writer = 0;
			if (d->bd_bif != NULL) {
				/*
				 * Remove descriptor from writers-only list
				 * and add it to active readers list.
				 */
				CK_LIST_REMOVE(d, bd_next);
				CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
				    d, bd_next);
				CTR2(KTR_NET,
				    "%s: upgrade required by pid %d",
				    __func__, d->bd_pid);
				track_event = true;
			}
		}
	}
	BPFD_UNLOCK(d);

	/* Release the replaced program through the epoch callback. */
	if (fcode != NULL)
		NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx);

	if (track_event)
		EVENTHANDLER_INVOKE(bpf_track,
		    d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);

	BPF_UNLOCK();
	return (0);
}

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp;
	struct ifnet *theywant;

	BPF_LOCK_ASSERT();

	theywant = ifunit(ifr->ifr_name);
	if (theywant == NULL || theywant->if_bpf == NULL)
		return (ENXIO);

	bp = theywant->if_bpf;
	/*
	 * At this point, we expect the buffer is already allocated.  If not,
	 * return an error.
	 */
	switch (d->bd_bufmode) {
	case BPF_BUFMODE_BUFFER:
	case BPF_BUFMODE_ZBUF:
		if (d->bd_sbuf == NULL)
			return (EINVAL);
		break;

	default:
		panic("bpf_setif: bufmode %d", d->bd_bufmode);
	}
	/* Re-selecting the current interface only resets the descriptor. */
	if (bp != d->bd_bif)
		bpf_attachd(d, bp);
	else {
		BPFD_LOCK(d);
		reset_d(d);
		BPFD_UNLOCK(d);
	}
	return (0);
}

/*
 * Support for select() and poll() system calls
 *
 * Return true iff the specific operation will not block indefinitely.
 * Otherwise, return false but make a note that a selwakeup() must be done.
 */
static int
bpfpoll(struct cdev *dev, int events, struct thread *td)
{
	struct bpf_d *d;
	int revents;

	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
		return (events &
		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));

	/*
	 * Refresh PID associated with this descriptor.
	 */
	/* Write events are always reported as ready. */
	revents = events & (POLLOUT | POLLWRNORM);
	BPFD_LOCK(d);
	BPF_PID_REFRESH(d, td);
	if (events & (POLLIN | POLLRDNORM)) {
		if (bpf_ready(d))
			revents |= events & (POLLIN | POLLRDNORM);
		else {
			selrecord(td, &d->bd_sel);
			/* Start the read timeout if necessary. */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				callout_reset(&d->bd_callout, d->bd_rtout,
				    bpf_timed_out, d);
				d->bd_state = BPF_WAITING;
			}
		}
	}
	BPFD_UNLOCK(d);
	return (revents);
}

/*
 * Support for kevent() system call.  Register EVFILT_READ filters and
 * reject all others.
 */
/*
 * NOTE(review): the code below also accepts EVFILT_WRITE, which the
 * comment above does not mention.
 */
int
bpfkqfilter(struct cdev *dev, struct knote *kn)
{
	struct bpf_d *d;

	if (devfs_get_cdevpriv((void **)&d) != 0)
		return (1);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &bpfread_filtops;
		break;

	case EVFILT_WRITE:
		kn->kn_fop = &bpfwrite_filtops;
		break;

	default:
		return (1);
	}

	/*
	 * Refresh PID associated with this descriptor.
	 */
	BPFD_LOCK(d);
	BPF_PID_REFRESH_CUR(d);
	kn->kn_hook = d;
	knlist_add(&d->bd_sel.si_note, kn, 1);
	BPFD_UNLOCK(d);

	return (0);
}

/* Remove the knote from the descriptor's knlist. */
static void
filt_bpfdetach(struct knote *kn)
{
	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;

	knlist_remove(&d->bd_sel.si_note, kn, 0);
}

/*
 * EVFILT_READ filter: report the number of readable bytes in kn_data when
 * bpf_ready() is true, otherwise arm the read timeout if one is configured
 * and the descriptor is idle.
 */
static int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
	int ready;

	BPFD_LOCK_ASSERT(d);
	ready = bpf_ready(d);
	if (ready) {
		kn->kn_data = d->bd_slen;
		/*
		 * Ignore the hold buffer if it is being copied to user space.
		 */
		if (!d->bd_hbuf_in_use && d->bd_hbuf)
			kn->kn_data += d->bd_hlen;
	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
		callout_reset(&d->bd_callout, d->bd_rtout,
		    bpf_timed_out, d);
		d->bd_state = BPF_WAITING;
	}

	return (ready);
}

/*
 * EVFILT_WRITE filter: writable whenever an interface is attached; kn_data
 * is then set to that interface's MTU.
 */
static int
filt_bpfwrite(struct knote *kn, long hint)
{
	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;

	BPFD_LOCK_ASSERT(d);

	if (d->bd_bif == NULL) {
		kn->kn_data = 0;
		return (0);
	} else {
		kn->kn_data = d->bd_bif->bif_ifp->if_mtu;
		return (1);
	}
}

/* Timestamp quality levels, in increasing order. */
#define	BPF_TSTAMP_NONE		0
#define	BPF_TSTAMP_FAST		1
#define	BPF_TSTAMP_NORMAL	2
#define	BPF_TSTAMP_EXTERN	3

/* Map a BPF_T_* timestamp type onto one of the quality levels above. */
static int
bpf_ts_quality(int tstype)
{

	if (tstype == BPF_T_NONE)
		return (BPF_TSTAMP_NONE);
	if ((tstype & BPF_T_FAST) != 0)
		return (BPF_TSTAMP_FAST);

	return (BPF_TSTAMP_NORMAL);
}

/*
 * Fill *bt with a timestamp for the packet and return its quality level.
 * A timestamp carried by the mbuf (M_TSTMP or an MTAG_BPF_TIMESTAMP tag)
 * is preferred; otherwise the uptime clock is read.  *bt is left untouched
 * when the requested type is BPF_T_NONE.
 */
static int
bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
{
	struct timespec ts;
	struct m_tag *tag;
	int quality;

	quality = bpf_ts_quality(tstype);
	if (quality == BPF_TSTAMP_NONE)
		return (quality);

	if (m != NULL) {
		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) ==
		    (M_PKTHDR | M_TSTMP)) {
			mbuf_tstmp2timespec(m, &ts);
			timespec2bintime(&ts, bt);
			return (BPF_TSTAMP_EXTERN);
		}
		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
		if (tag != NULL) {
			*bt = *(struct bintime *)(tag + 1);
			return (BPF_TSTAMP_EXTERN);
		}
	}
	if (quality == BPF_TSTAMP_NORMAL)
		binuptime(bt);
	else
		getbinuptime(bt);

	return (quality);
}

/*
 * Incoming linkage from device drivers.
Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* * NB: We dont call BPF_CHECK_DIRECTION() here since there * is no way for the caller to indiciate to us whether this * packet is inbound or outbound. In the bpf_mtap() routines, * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen) { if (bpf_peers_present(ifp->if_bpf)) bpf_tap(ifp->if_bpf, pkt, pktlen); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. 
*/ if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_mtap_if(if_t ifp, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); bpf_mtap(ifp->if_bpf, m); } } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. 
*/ mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); bpf_mtap2(ifp->if_bpf, data, dlen, m); } } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2, boottimebin; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; getboottimebin(&boottimebin); bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = 
tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { static char zeroes[BPF_ALIGNMENT]; struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, pad, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); if (d->bd_bif == NULL) { /* Descriptor was detached in concurrent thread */ counter_u64_add(d->bd_dcount, 1); return; } /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. 
* * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); counter_u64_add(d->bd_dcount, 1); return; } KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else { if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) { /* * Immediate mode is set, or the read timeout has * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; } pad = curlen - d->bd_slen; KASSERT(pad >= 0 && pad <= sizeof(zeroes), ("%s: invalid pad byte count %d", __func__, pad)); if (pad > 0) { /* Zero pad bytes. 
*/ bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes, pad); } } caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpfd_free(epoch_context_t ctx) { struct bpf_d *d; struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. 
*/ d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { p = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = d->bd_bfilter; #endif bpf_program_buffer_free(&p->epoch_ctx); } if (d->bd_wfilter != NULL) { p = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = NULL; #endif bpf_program_buffer_free(&p->epoch_ctx); } mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); counter_u64_free(d->bd_wcount); counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); free(d, M_BPF); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); CK_LIST_INIT(&bp->bif_dlist); CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; refcount_init(&bp->bif_refcnt, 1); *driverp = bp; /* * Reference ifnet pointer, so it won't freed until * we release it. 
*/ if_ref(ifp); BPF_LOCK(); CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } #ifdef VIMAGE /* * When moving interfaces between vnet instances we need a way to * query the dlt and hdrlen before detach so we can re-attch the if_bpf * after the vmove. We unfortunately have no device driver infrastructure * to query the interface for these values after creation/attach, thus * add this as a workaround. */ int bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) { if (bp == NULL) return (ENXIO); if (bif_dlt == NULL && bif_hdrlen == NULL) return (0); if (bif_dlt != NULL) *bif_dlt = bp->bif_dlt; if (bif_hdrlen != NULL) *bif_hdrlen = bp->bif_hdrlen; return (0); } #endif /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if; CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Detach common descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d, true); } /* Detach writer-only descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d, true); } bpfif_rele(bp); } BPF_UNLOCK(); } +bool +bpf_peers_present_if(struct ifnet *ifp) +{ + struct bpf_if *bp = ifp->if_bpf; + + return (bpf_peers_present(bp) > 0); +} + /* * Get a list of available data link type of the interface. 
*/ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { struct ifnet *ifp; struct bpf_if *bp; u_int *lst; int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; n1 = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } if (bfl->bfl_list == NULL) { bfl->bfl_len = n1; return (0); } if (n1 > bfl->bfl_len) return (ENOMEM); lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; lst[n++] = bp->bif_dlt; } error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); MPASS(d->bd_bif != NULL); /* * It is safe to check bd_bif without BPFD_LOCK, it can not be * changed while we hold global lock. */ if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return (EINVAL); opromisc = d->bd_promisc; bpf_attachd(d, bp); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", __func__, error); else d->bd_promisc = 1; } return (0); } static void bpf_drvinit(void *unused) { struct cdev *dev; sx_init(&bpf_sx, "bpf global lock"); CK_LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. 
*/ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); /* * We are protected by global lock here, interfaces and * descriptors can not be deleted while we hold it. */ CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); } } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = counter_u64_fetch(bd->bd_rcount); d->bd_dcount = counter_u64_fetch(bd->bd_dcount); d->bd_fcount = counter_u64_fetch(bd->bd_fcount); d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = counter_u64_fetch(bd->bd_wcount); d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount); d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount); d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy); d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. 
*/ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. */ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. 
*/ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap_if(if_t ifp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m) { } /* Stub: forwards to the bpfattach2() stub using the interface's own if_bpf slot. */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* Stub: point the driver's handle at the dead_bpf_if placeholder. */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = (struct bpf_if *)&dead_bpf_if; } void bpfdetach(struct ifnet *ifp) { } +bool +bpf_peers_present_if(struct ifnet *ifp) +{ + return (false); +} + u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ #ifdef DDB /* DDB helper: print selected fields of a struct bpf_if; a NULL pointer prints nothing. */ static void bpf_show_bpf_if(struct bpf_if *bpf_if) { if (bpf_if == NULL) return; db_printf("%p:\n", bpf_if); #define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); #define BPF_DB_PRINTF_RAW(f, e) db_printf(" %s = " f "\n", #e, e); /* bif_ext.bif_next */ /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); /* bif_wlist */ BPF_DB_PRINTF("%p", bif_ifp); BPF_DB_PRINTF("%p", bif_bpf); BPF_DB_PRINTF_RAW("%u", refcount_load(&bpf_if->bif_refcnt)); } /* "show bpf_if" DDB command: requires an address argument, which is interpreted as a struct bpf_if pointer. */ DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) { if (!have_addr) { db_printf("usage: show bpf_if \n"); return; } bpf_show_bpf_if((struct bpf_if *)addr); } #endif diff --git a/sys/net/bpf.h b/sys/net/bpf.h index 924dea5fc9f4..31968445aac1 100644 --- a/sys/net/bpf.h +++ b/sys/net/bpf.h @@ -1,467 +1,468 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved.
* * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)bpf.h 8.1 (Berkeley) 6/10/93 * @(#)bpf.h 1.34 (LBL) 6/16/96 */ #ifndef _NET_BPF_H_ #define _NET_BPF_H_ #include #include #include /* BSD style release date */ #define BPF_RELEASE 199606 typedef int32_t bpf_int32; typedef u_int32_t bpf_u_int32; typedef int64_t bpf_int64; typedef u_int64_t bpf_u_int64; struct ifnet; /* * Alignment macros. BPF_WORDALIGN rounds up to the next multiple of * BPF_ALIGNMENT. */ #define BPF_ALIGNMENT sizeof(long) #define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1)) #define BPF_MAXINSNS 512 #define BPF_MAXBUFSIZE 0x80000 #define BPF_MINBUFSIZE 32 /* * Structure for BIOCSETF. */ struct bpf_program { u_int bf_len; struct bpf_insn *bf_insns; }; /* * Struct returned by BIOCGSTATS. */ struct bpf_stat { u_int bs_recv; /* number of packets received */ u_int bs_drop; /* number of packets dropped */ }; /* * Struct returned by BIOCVERSION. This represents the version number of * the filter language described by the instruction encodings below. * bpf understands a program iff kernel_major == filter_major && * kernel_minor >= filter_minor, that is, if the value returned by the * running kernel has the same major number and a minor number equal * to or less than the filter being downloaded. Otherwise, the * results are undefined, meaning an error may be returned or packets * may be accepted haphazardly. * It has nothing to do with the source code version. */ struct bpf_version { u_short bv_major; u_short bv_minor; }; /* Current version number of filter architecture. */ #define BPF_MAJOR_VERSION 1 #define BPF_MINOR_VERSION 1 /* * Historically, BPF has supported a single buffering model, first using mbuf * clusters in kernel, and later using malloc(9) buffers in kernel. We now * support multiple buffering modes, which may be queried and set using * BIOCGETBUFMODE and BIOCSETBUFMODE.
So as to avoid handling the complexity * of changing modes while sniffing packets, the mode becomes fixed once an * interface has been attached to the BPF descriptor. */ #define BPF_BUFMODE_BUFFER 1 /* Kernel buffers with read(). */ #define BPF_BUFMODE_ZBUF 2 /* Zero-copy buffers. */ /*- * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy * buffers as used by BPF. */ struct bpf_zbuf { void *bz_bufa; /* Location of 'a' zero-copy buffer. */ void *bz_bufb; /* Location of 'b' zero-copy buffer. */ size_t bz_buflen; /* Size of zero-copy buffers. */ }; #define BIOCGBLEN _IOR('B', 102, u_int) #define BIOCSBLEN _IOWR('B', 102, u_int) #define BIOCSETF _IOW('B', 103, struct bpf_program) #define BIOCFLUSH _IO('B', 104) #define BIOCPROMISC _IO('B', 105) #define BIOCGDLT _IOR('B', 106, u_int) #define BIOCGETIF _IOR('B', 107, struct ifreq) #define BIOCSETIF _IOW('B', 108, struct ifreq) #define BIOCSRTIMEOUT _IOW('B', 109, struct timeval) #define BIOCGRTIMEOUT _IOR('B', 110, struct timeval) #define BIOCGSTATS _IOR('B', 111, struct bpf_stat) #define BIOCIMMEDIATE _IOW('B', 112, u_int) #define BIOCVERSION _IOR('B', 113, struct bpf_version) #define BIOCGRSIG _IOR('B', 114, u_int) #define BIOCSRSIG _IOW('B', 115, u_int) #define BIOCGHDRCMPLT _IOR('B', 116, u_int) #define BIOCSHDRCMPLT _IOW('B', 117, u_int) #define BIOCGDIRECTION _IOR('B', 118, u_int) #define BIOCSDIRECTION _IOW('B', 119, u_int) #define BIOCSDLT _IOW('B', 120, u_int) #define BIOCGDLTLIST _IOWR('B', 121, struct bpf_dltlist) #define BIOCLOCK _IO('B', 122) #define BIOCSETWF _IOW('B', 123, struct bpf_program) #define BIOCFEEDBACK _IOW('B', 124, u_int) #define BIOCGETBUFMODE _IOR('B', 125, u_int) #define BIOCSETBUFMODE _IOW('B', 126, u_int) #define BIOCGETZMAX _IOR('B', 127, size_t) #define BIOCROTZBUF _IOR('B', 128, struct bpf_zbuf) #define BIOCSETZBUF _IOW('B', 129, struct bpf_zbuf) #define BIOCSETFNR _IOW('B', 130, struct bpf_program) #define BIOCGTSTAMP _IOR('B', 131, u_int) #define BIOCSTSTAMP
_IOW('B', 132, u_int) #define BIOCSETVLANPCP _IOW('B', 133, u_int) /* Obsolete */ #define BIOCGSEESENT BIOCGDIRECTION #define BIOCSSEESENT BIOCSDIRECTION /* Packet directions */ enum bpf_direction { BPF_D_IN, /* See incoming packets */ BPF_D_INOUT, /* See incoming and outgoing packets */ BPF_D_OUT /* See outgoing packets */ }; /* Time stamping functions */ #define BPF_T_MICROTIME 0x0000 #define BPF_T_NANOTIME 0x0001 #define BPF_T_BINTIME 0x0002 #define BPF_T_NONE 0x0003 #define BPF_T_FORMAT_MASK 0x0003 #define BPF_T_NORMAL 0x0000 #define BPF_T_FAST 0x0100 #define BPF_T_MONOTONIC 0x0200 #define BPF_T_MONOTONIC_FAST (BPF_T_FAST | BPF_T_MONOTONIC) #define BPF_T_FLAG_MASK 0x0300 #define BPF_T_FORMAT(t) ((t) & BPF_T_FORMAT_MASK) #define BPF_T_FLAG(t) ((t) & BPF_T_FLAG_MASK) #define BPF_T_VALID(t) \ ((t) == BPF_T_NONE || (BPF_T_FORMAT(t) != BPF_T_NONE && \ ((t) & ~(BPF_T_FORMAT_MASK | BPF_T_FLAG_MASK)) == 0)) #define BPF_T_MICROTIME_FAST (BPF_T_MICROTIME | BPF_T_FAST) #define BPF_T_NANOTIME_FAST (BPF_T_NANOTIME | BPF_T_FAST) #define BPF_T_BINTIME_FAST (BPF_T_BINTIME | BPF_T_FAST) #define BPF_T_MICROTIME_MONOTONIC (BPF_T_MICROTIME | BPF_T_MONOTONIC) #define BPF_T_NANOTIME_MONOTONIC (BPF_T_NANOTIME | BPF_T_MONOTONIC) #define BPF_T_BINTIME_MONOTONIC (BPF_T_BINTIME | BPF_T_MONOTONIC) #define BPF_T_MICROTIME_MONOTONIC_FAST (BPF_T_MICROTIME | BPF_T_MONOTONIC_FAST) #define BPF_T_NANOTIME_MONOTONIC_FAST (BPF_T_NANOTIME | BPF_T_MONOTONIC_FAST) #define BPF_T_BINTIME_MONOTONIC_FAST (BPF_T_BINTIME | BPF_T_MONOTONIC_FAST) /* * Structure prepended to each packet. 
*/ struct bpf_ts { bpf_int64 bt_sec; /* seconds */ bpf_u_int64 bt_frac; /* fraction */ }; struct bpf_xhdr { struct bpf_ts bh_tstamp; /* time stamp */ bpf_u_int32 bh_caplen; /* length of captured portion */ bpf_u_int32 bh_datalen; /* original length of packet */ u_short bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; /* Obsolete */ struct bpf_hdr { struct timeval bh_tstamp; /* time stamp */ bpf_u_int32 bh_caplen; /* length of captured portion */ bpf_u_int32 bh_datalen; /* original length of packet */ u_short bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #ifdef _KERNEL #define MTAG_BPF 0x627066 #define MTAG_BPF_TIMESTAMP 0 #endif /* * When using zero-copy BPF buffers, a shared memory header is present * allowing the kernel BPF implementation and user process to synchronize * without using system calls. This structure defines that header. When * accessing these fields, appropriate atomic operations and memory barriers * are required in order not to see stale or out-of-order data; see bpf(4) * for reference code to access these fields from userspace. * * The layout of this structure is critical, and must not be changed; it must * fit in a single page on all architectures. */ struct bpf_zbuf_header { volatile u_int bzh_kernel_gen; /* Kernel generation number. */ volatile u_int bzh_kernel_len; /* Length of data in the buffer. */ volatile u_int bzh_user_gen; /* User generation number. */ u_int _bzh_pad[5]; }; /* * The instruction encodings. * * Please inform tcpdump-workers@lists.tcpdump.org if you use any * of the reserved values, so that we can note that they're used * (and perhaps implement it in the reference BPF implementation * and encourage its implementation elsewhere). */ /* * The upper 8 bits of the opcode aren't used. BSD/OS used 0x8000.
*/ /* instruction classes */ #define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 #define BPF_LDX 0x01 #define BPF_ST 0x02 #define BPF_STX 0x03 #define BPF_ALU 0x04 #define BPF_JMP 0x05 #define BPF_RET 0x06 #define BPF_MISC 0x07 /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) #define BPF_W 0x00 #define BPF_H 0x08 #define BPF_B 0x10 /* 0x18 reserved; used by BSD/OS */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 #define BPF_IND 0x40 #define BPF_MEM 0x60 #define BPF_LEN 0x80 #define BPF_MSH 0xa0 /* 0xc0 reserved; used by BSD/OS */ /* 0xe0 reserved; used by BSD/OS */ /* alu/jmp fields */ #define BPF_OP(code) ((code) & 0xf0) #define BPF_ADD 0x00 #define BPF_SUB 0x10 #define BPF_MUL 0x20 #define BPF_DIV 0x30 #define BPF_OR 0x40 #define BPF_AND 0x50 #define BPF_LSH 0x60 #define BPF_RSH 0x70 #define BPF_NEG 0x80 #define BPF_MOD 0x90 #define BPF_XOR 0xa0 /* 0xb0 reserved */ /* 0xc0 reserved */ /* 0xd0 reserved */ /* 0xe0 reserved */ /* 0xf0 reserved */ #define BPF_JA 0x00 #define BPF_JEQ 0x10 #define BPF_JGT 0x20 #define BPF_JGE 0x30 #define BPF_JSET 0x40 /* 0x50 reserved; used on BSD/OS */ /* 0x60 reserved */ /* 0x70 reserved */ /* 0x80 reserved */ /* 0x90 reserved */ /* 0xa0 reserved */ /* 0xb0 reserved */ /* 0xc0 reserved */ /* 0xd0 reserved */ /* 0xe0 reserved */ /* 0xf0 reserved */ #define BPF_SRC(code) ((code) & 0x08) #define BPF_K 0x00 #define BPF_X 0x08 /* ret - BPF_K and BPF_X also apply */ #define BPF_RVAL(code) ((code) & 0x18) #define BPF_A 0x10 /* 0x18 reserved */ /* misc */ #define BPF_MISCOP(code) ((code) & 0xf8) #define BPF_TAX 0x00 /* 0x08 reserved */ /* 0x10 reserved */ /* 0x18 reserved */ /* #define BPF_COP 0x20 NetBSD "coprocessor" extensions */ /* 0x28 reserved */ /* 0x30 reserved */ /* 0x38 reserved */ /* #define BPF_COPX 0x40 NetBSD "coprocessor" extensions */ /* also used on BSD/OS */ /* 0x48 reserved */ /* 0x50 reserved */ /* 0x58 reserved */ /* 0x60 reserved */ /* 0x68 reserved */ /* 0x70 
reserved */ /* 0x78 reserved */ #define BPF_TXA 0x80 /* 0x88 reserved */ /* 0x90 reserved */ /* 0x98 reserved */ /* 0xa0 reserved */ /* 0xa8 reserved */ /* 0xb0 reserved */ /* 0xb8 reserved */ /* 0xc0 reserved; used on BSD/OS */ /* 0xc8 reserved */ /* 0xd0 reserved */ /* 0xd8 reserved */ /* 0xe0 reserved */ /* 0xe8 reserved */ /* 0xf0 reserved */ /* 0xf8 reserved */ /* * The instruction data structure. */ struct bpf_insn { u_short code; u_char jt; u_char jf; bpf_u_int32 k; }; /* * Macros for insn array initializers. */ #define BPF_STMT(code, k) { (u_short)(code), 0, 0, k } #define BPF_JUMP(code, k, jt, jf) { (u_short)(code), jt, jf, k } /* * Structure to retrieve available DLTs for the interface. */ struct bpf_dltlist { u_int bfl_len; /* number of entries in the bfl_list array */ u_int *bfl_list; /* array of DLTs */ }; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_BPF); #endif #ifdef SYSCTL_DECL SYSCTL_DECL(_net_bpf); #endif /* * Rotate the packet buffers in descriptor d. Move the store buffer into the * hold slot, and the free buffer into the store slot. Zero the length of the * new store buffer. Descriptor lock should be held. One must be careful to * not rotate the buffers twice, i.e. if fbuf != NULL. */ #define ROTATE_BUFFERS(d) do { \ (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ (d)->bd_sbuf = (d)->bd_fbuf; \ (d)->bd_slen = 0; \ (d)->bd_fbuf = NULL; \ bpf_bufheld(d); \ } while (0) /* * Descriptor associated with each attached hardware interface. * Part of this structure is exposed to external callers to speed up * bpf_peers_present() calls.
*/ struct bpf_if; CK_LIST_HEAD(bpfd_list, bpf_d); struct bpf_if_ext { CK_LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ struct bpfd_list bif_dlist; /* descriptor list */ }; void bpf_bufheld(struct bpf_d *d); int bpf_validate(const struct bpf_insn *, int); void bpf_tap(struct bpf_if *, u_char *, u_int); void bpf_tap_if(struct ifnet *, u_char *, u_int); void bpf_mtap(struct bpf_if *, struct mbuf *); void bpf_mtap_if(struct ifnet *, struct mbuf *); void bpf_mtap2(struct bpf_if *, void *, u_int, struct mbuf *); void bpf_mtap2_if(struct ifnet *, void *, u_int, struct mbuf *); void bpfattach(struct ifnet *, u_int, u_int); void bpfattach2(struct ifnet *, u_int, u_int, struct bpf_if **); void bpfdetach(struct ifnet *); +bool bpf_peers_present_if(struct ifnet *); #ifdef VIMAGE int bpf_get_bp_params(struct bpf_if *, u_int *, u_int *); #endif void bpfilterattach(int); u_int bpf_filter(const struct bpf_insn *, u_char *, u_int, u_int); /* * Return 1 when the interface's bif_dlist holds at least one descriptor and * 0 when it is empty. NOTE(review): the cast assumes a struct bpf_if begins * with a struct bpf_if_ext -- confirm against the struct bpf_if definition. */ static __inline int bpf_peers_present(struct bpf_if *bpf) { struct bpf_if_ext *ext; ext = (struct bpf_if_ext *)bpf; if (!CK_LIST_EMPTY(&ext->bif_dlist)) return (1); return (0); } /* The tap macros take the interface itself and forward to the *_if() functions. */ #define BPF_TAP(_ifp,_pkt,_pktlen) \ bpf_tap_if((_ifp), (_pkt), (_pktlen)) #define BPF_MTAP(_ifp,_m) \ bpf_mtap_if((_ifp), (_m)) #define BPF_MTAP2(_ifp,_data,_dlen,_m) \ bpf_mtap2_if((_ifp), (_data), (_dlen), (_m)) #endif /* * Number of scratch memory words (for BPF_LD|BPF_MEM and BPF_ST). */ #define BPF_MEMWORDS 16 /* BPF attach/detach events */ typedef void (*bpf_track_fn)(void *, struct ifnet *, int /* dlt */, int /* 1 =>'s attach */); EVENTHANDLER_DECLARE(bpf_track, bpf_track_fn); #endif /* _NET_BPF_H_ */