Index: head/lib/libc/sys/sendfile.2 =================================================================== --- head/lib/libc/sys/sendfile.2 (revision 293438) +++ head/lib/libc/sys/sendfile.2 (revision 293439) @@ -1,318 +1,377 @@ .\" Copyright (c) 2003, David G. Lawrence .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice unmodified, this list of conditions, and the following .\" disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 7, 2010 +.Dd January 7, 2016 .Dt SENDFILE 2 .Os .Sh NAME .Nm sendfile .Nd send a file to a socket .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/socket.h .In sys/uio.h .Ft int .Fo sendfile .Fa "int fd" "int s" "off_t offset" "size_t nbytes" .Fa "struct sf_hdtr *hdtr" "off_t *sbytes" "int flags" .Fc .Sh DESCRIPTION The .Fn sendfile system call -sends a regular file specified by descriptor +sends a regular file or shared memory object specified by descriptor .Fa fd out a stream socket specified by descriptor .Fa s . .Pp The .Fa offset argument specifies where to begin in the file. Should .Fa offset fall beyond the end of file, the system will return success and report 0 bytes sent as described below. The .Fa nbytes argument specifies how many bytes of the file should be sent, with 0 having the special meaning of send until the end of file has been reached. .Pp An optional header and/or trailer can be sent before and after the file data by specifying a pointer to a .Vt "struct sf_hdtr" , which has the following structure: .Pp .Bd -literal -offset indent -compact struct sf_hdtr { struct iovec *headers; /* pointer to header iovecs */ int hdr_cnt; /* number of header iovecs */ struct iovec *trailers; /* pointer to trailer iovecs */ int trl_cnt; /* number of trailer iovecs */ }; .Ed .Pp The .Fa headers and .Fa trailers pointers, if .Pf non- Dv NULL , point to arrays of .Vt "struct iovec" structures. See the .Fn writev system call for information on the iovec structure. The number of iovecs in these arrays is specified by .Fa hdr_cnt and .Fa trl_cnt . .Pp If .Pf non- Dv NULL , the system will write the total number of bytes sent on the socket to the variable pointed to by .Fa sbytes . .Pp -The +The least significant 16 bits of .Fa flags argument is a bitmap of these values: -.Bl -item -offset indent -.It -.Dv SF_NODISKIO . 
-This flag causes any -.Fn sendfile -call which would block on disk I/O to instead -return -.Er EBUSY . -Busy servers may benefit by transferring requests that would -block to a separate I/O worker thread. -.It -.Dv SF_MNOWAIT . -Do not wait for some kernel resource to become available, -in particular, -.Vt mbuf -and -.Vt sf_buf . -The flag does not make the -.Fn sendfile -syscall truly non-blocking, since other resources are still allocated -in a blocking fashion. -.It -.Dv SF_SYNC . +.Bl -tag -offset indent +.It Dv SF_NODISKIO +This flag causes .Nm +to return +.Er EBUSY +instead of blocking when a busy page is encountered. +This rare situation can happen if some other process is currently working +with the same region of the file. +It is advised to retry the operation after a short period. +.Pp +Note that in older +.Fx +versions the +.Dv SF_NODISKIO +flag had a slightly different meaning. +The flag prevented +.Nm +from running I/O operations when an invalid (not cached) page was encountered, +thus avoiding blocking on I/O. +Starting with +.Fx 11 , +.Nm +does not block on I/O when sending files off the +.Xr ffs 7 +filesystem (see +.Sx IMPLEMENTATION NOTES +), so the condition no longer applies. +However, it is safe for an application to use +.Dv SF_NODISKIO +and, on +.Er EBUSY , +perform the same recovery action it would have performed on older +.Fx +versions, e.g. +.Xr aio_read 2 , +.Xr read 2 , +or +.Nm +in a different context. +.It Dv SF_NOCACHE +The data sent to the socket will not be cached by the virtual memory system, +and will be freed directly to the pool of free pages. +.It Dv SF_SYNC +.Nm sleeps until the network stack no longer references the VM pages of the file, making subsequent modifications to it safe. Please note that this is not a guarantee that the data has actually been sent. .El .Pp +The most significant 16 bits of +.Fa flags +specify the number of pages that +.Nm +may read ahead when reading the file. +A macro, +.Fn SF_FLAGS , +is provided to combine the readahead amount and the flags. +The following example specifies a readahead of 16 pages and the +.Dv SF_NOCACHE +flag: +.Pp +.Bd -literal -offset indent -compact + SF_FLAGS(16, SF_NOCACHE) +.Ed +.Pp When using a socket marked for non-blocking I/O, .Fn sendfile may send fewer bytes than requested. In this case, the number of bytes successfully written is returned in .Fa *sbytes (if specified), and the error .Er EAGAIN is returned. .Sh IMPLEMENTATION NOTES The .Fx implementation of .Fn sendfile +does not block on disk I/O when it sends a file off the +.Xr ffs 7 +filesystem. +The syscall returns success before the actual I/O completes, and the data +is put into the socket later, unattended. +However, the order of data in the socket is preserved, so it is safe +to do further writes to the socket. +.Pp +The +.Fx +implementation of +.Fn sendfile is "zero-copy", meaning that it has been optimized so that copying of the file data is avoided. 
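.Pp
The following minimal sketch ties the interfaces described above together:
it sends a header followed by an entire file, asking for a readahead of 16
pages and no caching, and retries after partial sends.
The descriptors
.Fa fd
and
.Fa s ,
the header buffer
.Va hdr
of length
.Va hdrlen ,
and the busy retry loop are illustrative assumptions; a real program would
wait in
.Xr poll 2
for the socket to become writable before retrying, and needs
.In errno.h
and
.In err.h
in addition to the headers listed above:
.Bd -literal -offset indent
struct iovec iov = { .iov_base = hdr, .iov_len = hdrlen };
struct sf_hdtr hdtr = { .headers = &iov, .hdr_cnt = 1 };
off_t off = 0, sent;

while (sendfile(fd, s, off, 0, &hdtr, &sent,
    SF_FLAGS(16, SF_NOCACHE)) == -1) {
	if (errno != EAGAIN && errno != EBUSY && errno != EINTR)
		err(1, "sendfile");
	/* sent counts header bytes as well as file bytes */
	if (hdtr.hdr_cnt != 0) {
		if ((size_t)sent >= iov.iov_len) {
			sent -= iov.iov_len;
			hdtr.hdr_cnt = 0;	/* header is out */
		} else {
			iov.iov_base = (char *)iov.iov_base + sent;
			iov.iov_len -= sent;
			sent = 0;	/* only header bytes went out */
		}
	}
	off += sent;	/* resume the file after what was sent */
}
.Ed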
.Sh TUNING On some architectures, this system call internally uses a special .Fn sendfile buffer .Pq Vt "struct sf_buf" to handle sending file data to the client. If the sending socket is blocking, and there are not enough .Fn sendfile buffers available, .Fn sendfile will block and report a state of .Dq Li sfbufa . If the sending socket is non-blocking and there are not enough .Fn sendfile buffers available, the call will block and wait for the necessary buffers to become available before finishing the call. .Pp The number of .Vt sf_buf Ns 's allocated should be proportional to the number of nmbclusters used to send data to a client via .Fn sendfile . Busy installations that make extensive use of .Fn sendfile may want to increase these values to be in line with their .Va kern.ipc.nmbclusters (see .Xr tuning 7 for details). .Pp The number of .Fn sendfile buffers available is determined at boot time by either the .Va kern.ipc.nsfbufs .Xr loader.conf 5 variable or the .Dv NSFBUFS kernel configuration tunable. The number of .Fn sendfile buffers scales with .Va kern.maxusers . The .Va kern.ipc.nsfbufsused and .Va kern.ipc.nsfbufspeak read-only .Xr sysctl 8 variables show current and peak .Fn sendfile buffers usage respectively. These values may also be viewed through .Nm netstat Fl m . .Pp If a value of zero is reported for .Va kern.ipc.nsfbufs , your architecture does not need to use .Fn sendfile buffers because their task can be efficiently performed by the generic virtual memory structures. .Sh RETURN VALUES .Rv -std sendfile .Sh ERRORS .Bl -tag -width Er .It Bq Er EAGAIN The socket is marked for non-blocking I/O and not all data was sent due to the socket buffer being filled. If specified, the number of bytes successfully sent will be returned in .Fa *sbytes . .It Bq Er EBADF The .Fa fd argument is not a valid file descriptor. .It Bq Er EBADF The .Fa s argument is not a valid socket descriptor. .It Bq Er EBUSY -Completing the entire transfer would have required disk I/O, so -it was aborted. -Partial data may have been sent. -(This error can only occur when +A busy page was encountered and .Dv SF_NODISKIO -is specified.) +was specified. +Partial data may have been sent. .It Bq Er EFAULT An invalid address was specified for an argument. .It Bq Er EINTR A signal interrupted .Fn sendfile before it could be completed. If specified, the number of bytes successfully sent will be returned in .Fa *sbytes . .It Bq Er EINVAL The .Fa fd argument is not a regular file. .It Bq Er EINVAL The .Fa s argument is not a SOCK_STREAM type socket. .It Bq Er EINVAL The .Fa offset argument is negative. .It Bq Er EIO An error occurred while reading from .Fa fd . .It Bq Er ENOBUFS The system was unable to allocate an internal buffer. .It Bq Er ENOTCONN The .Fa s argument points to an unconnected socket. .It Bq Er ENOTSOCK The .Fa s argument is not a socket. .It Bq Er EOPNOTSUPP The file system for descriptor .Fa fd does not support .Fn sendfile . .It Bq Er EPIPE The socket peer has closed the connection. .El .Sh SEE ALSO .Xr netstat 1 , .Xr open 2 , .Xr send 2 , .Xr socket 2 , .Xr writev 2 , .Xr tuning 7 .Rs .%A K. Elmeleegy .%A A. Chanda .%A A. L. Cox .%A W. Zwaenepoel .%T A Portable Kernel Abstraction for Low-Overhead Ephemeral Mapping Management .%J The Proceedings of the 2005 USENIX Annual Technical Conference .%P pp 223-236 .%D 2005 .Re .Sh HISTORY The .Fn sendfile system call first appeared in .Fx 3.0 . This manual page first appeared in .Fx 3.1 . +In +.Fx 10 +support for sending shared memory descriptors was introduced. +In +.Fx 11 +a non-blocking implementation was introduced. .Sh AUTHORS -The +The initial implementation of the .Fn sendfile system call and this manual page were written by .An David G. Lawrence Aq Mt dg@dglawrence.com . +The +.Fx 11 +implementation was written by +.An Gleb Smirnoff Aq Mt glebius@FreeBSD.org . Index: head/sys/dev/ti/if_ti.c =================================================================== --- head/sys/dev/ti/if_ti.c (revision 293438) +++ head/sys/dev/ti/if_ti.c (revision 293439) @@ -1,4057 +1,4057 @@ /*- * Copyright (c) 1997, 1998, 1999 * Bill Paul . All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Bill Paul. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Bill Paul AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Alteon Networks Tigon PCI gigabit ethernet driver for FreeBSD. * Manuals, sample driver and firmware source kits are available * from http://www.alteon.com/support/openkits. * * Written by Bill Paul * Electrical Engineering Department * Columbia University, New York City */ /* * The Alteon Networks Tigon chip contains an embedded R4000 CPU, * gigabit MAC, dual DMA channels and a PCI interface unit. NICs * using the Tigon may have anywhere from 512K to 2MB of SRAM. The * Tigon supports hardware IP, TCP and UCP checksumming, multicast * filtering and jumbo (9014 byte) frames. The hardware is largely * controlled by firmware, which must be loaded into the NIC during * initialization. * * The Tigon 2 contains 2 R4000 CPUs and requires a newer firmware * revision, which supports new features such as extended commands, * extended jumbo receive ring desciptors and a mini receive ring. * * Alteon Networks is to be commended for releasing such a vast amount * of development material for the Tigon NIC without requiring an NDA * (although they really should have done it a long time ago). With * any luck, the other vendors will finally wise up and follow Alteon's * stellar example. * * The firmware for the Tigon 1 and 2 NICs is compiled directly into * this driver by #including it as a C header file. This bloats the * driver somewhat, but it's the easiest method considering that the * driver code and firmware code need to be kept in sync. The source * for the firmware is not provided with the FreeBSD distribution since * compiling it requires a GNU toolchain targeted for mips-sgi-irix5.3. 
* * The following people deserve special thanks: * - Terry Murphy of 3Com, for providing a 3c985 Tigon 1 board * for testing * - Raymond Lee of Netgear, for providing a pair of Netgear * GA620 Tigon 2 boards for testing * - Ulf Zimmermann, for bringing the GA260 to my attention and * convincing me to write this driver. * - Andrew Gallatin for providing FreeBSD/Alpha support. */ #include __FBSDID("$FreeBSD$"); #include "opt_ti.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TI_SF_BUF_JUMBO #include #include #endif #include #include #include #include #include #include #include #define TI_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP) /* * We can only turn on header splitting if we're using extended receive * BDs. */ #if defined(TI_JUMBO_HDRSPLIT) && !defined(TI_SF_BUF_JUMBO) #error "options TI_JUMBO_HDRSPLIT requires TI_SF_BUF_JUMBO" #endif /* TI_JUMBO_HDRSPLIT && !TI_SF_BUF_JUMBO */ typedef enum { TI_SWAP_HTON, TI_SWAP_NTOH } ti_swap_type; /* * Various supported device vendors/types and their names. */ static const struct ti_type ti_devs[] = { { ALT_VENDORID, ALT_DEVICEID_ACENIC, "Alteon AceNIC 1000baseSX Gigabit Ethernet" }, { ALT_VENDORID, ALT_DEVICEID_ACENIC_COPPER, "Alteon AceNIC 1000baseT Gigabit Ethernet" }, { TC_VENDORID, TC_DEVICEID_3C985, "3Com 3c985-SX Gigabit Ethernet" }, { NG_VENDORID, NG_DEVICEID_GA620, "Netgear GA620 1000baseSX Gigabit Ethernet" }, { NG_VENDORID, NG_DEVICEID_GA620T, "Netgear GA620 1000baseT Gigabit Ethernet" }, { SGI_VENDORID, SGI_DEVICEID_TIGON, "Silicon Graphics Gigabit Ethernet" }, { DEC_VENDORID, DEC_DEVICEID_FARALLON_PN9000SX, "Farallon PN9000SX Gigabit Ethernet" }, { 0, 0, NULL } }; static d_open_t ti_open; static d_close_t ti_close; static d_ioctl_t ti_ioctl2; static struct cdevsw ti_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = ti_open, .d_close = ti_close, .d_ioctl = ti_ioctl2, .d_name = "ti", }; static int ti_probe(device_t); static int ti_attach(device_t); static int ti_detach(device_t); static void ti_txeof(struct ti_softc *); static void ti_rxeof(struct ti_softc *); static int ti_encap(struct ti_softc *, struct mbuf **); static void ti_intr(void *); static void ti_start(struct ifnet *); static void ti_start_locked(struct ifnet *); static int ti_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t ti_get_counter(struct ifnet *, ift_counter); static void ti_init(void *); static void ti_init_locked(void *); static void ti_init2(struct ti_softc *); static void ti_stop(struct ti_softc *); static void ti_watchdog(void *); static int ti_shutdown(device_t); static int ti_ifmedia_upd(struct ifnet *); static int ti_ifmedia_upd_locked(struct ti_softc *); static void ti_ifmedia_sts(struct ifnet *, struct ifmediareq *); static uint32_t ti_eeprom_putbyte(struct ti_softc *, int); static uint8_t ti_eeprom_getbyte(struct ti_softc *, int, uint8_t *); static int ti_read_eeprom(struct ti_softc *, caddr_t, int, int); static void ti_add_mcast(struct ti_softc *, struct ether_addr *); static void ti_del_mcast(struct ti_softc *, struct ether_addr *); static void ti_setmulti(struct ti_softc *); static void ti_mem_read(struct ti_softc *, uint32_t, uint32_t, void *); static void ti_mem_write(struct ti_softc *, uint32_t, uint32_t, void *); static void ti_mem_zero(struct ti_softc *, uint32_t, uint32_t); static int ti_copy_mem(struct ti_softc *, uint32_t, 
uint32_t, caddr_t, int, int); static int ti_copy_scratch(struct ti_softc *, uint32_t, uint32_t, caddr_t, int, int, int); static int ti_bcopy_swap(const void *, void *, size_t, ti_swap_type); static void ti_loadfw(struct ti_softc *); static void ti_cmd(struct ti_softc *, struct ti_cmd_desc *); static void ti_cmd_ext(struct ti_softc *, struct ti_cmd_desc *, caddr_t, int); static void ti_handle_events(struct ti_softc *); static void ti_dma_map_addr(void *, bus_dma_segment_t *, int, int); static int ti_dma_alloc(struct ti_softc *); static void ti_dma_free(struct ti_softc *); static int ti_dma_ring_alloc(struct ti_softc *, bus_size_t, bus_size_t, bus_dma_tag_t *, uint8_t **, bus_dmamap_t *, bus_addr_t *, const char *); static void ti_dma_ring_free(struct ti_softc *, bus_dma_tag_t *, uint8_t **, bus_dmamap_t, bus_addr_t *); static int ti_newbuf_std(struct ti_softc *, int); static int ti_newbuf_mini(struct ti_softc *, int); static int ti_newbuf_jumbo(struct ti_softc *, int, struct mbuf *); static int ti_init_rx_ring_std(struct ti_softc *); static void ti_free_rx_ring_std(struct ti_softc *); static int ti_init_rx_ring_jumbo(struct ti_softc *); static void ti_free_rx_ring_jumbo(struct ti_softc *); static int ti_init_rx_ring_mini(struct ti_softc *); static void ti_free_rx_ring_mini(struct ti_softc *); static void ti_free_tx_ring(struct ti_softc *); static int ti_init_tx_ring(struct ti_softc *); static void ti_discard_std(struct ti_softc *, int); #ifndef TI_SF_BUF_JUMBO static void ti_discard_jumbo(struct ti_softc *, int); #endif static void ti_discard_mini(struct ti_softc *, int); static int ti_64bitslot_war(struct ti_softc *); static int ti_chipinit(struct ti_softc *); static int ti_gibinit(struct ti_softc *); #ifdef TI_JUMBO_HDRSPLIT static __inline void ti_hdr_split(struct mbuf *top, int hdr_len, int pkt_len, int idx); #endif /* TI_JUMBO_HDRSPLIT */ static void ti_sysctl_node(struct ti_softc *); static device_method_t ti_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ti_probe), DEVMETHOD(device_attach, ti_attach), DEVMETHOD(device_detach, ti_detach), DEVMETHOD(device_shutdown, ti_shutdown), { 0, 0 } }; static driver_t ti_driver = { "ti", ti_methods, sizeof(struct ti_softc) }; static devclass_t ti_devclass; DRIVER_MODULE(ti, pci, ti_driver, ti_devclass, 0, 0); MODULE_DEPEND(ti, pci, 1, 1, 1); MODULE_DEPEND(ti, ether, 1, 1, 1); /* * Send an instruction or address to the EEPROM, check for ACK. */ static uint32_t ti_eeprom_putbyte(struct ti_softc *sc, int byte) { int i, ack = 0; /* * Make sure we're in TX mode. */ TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_TXEN); /* * Feed in each bit and stobe the clock. */ for (i = 0x80; i; i >>= 1) { if (byte & i) { TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_DOUT); } else { TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_DOUT); } DELAY(1); TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); DELAY(1); TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); } /* * Turn off TX mode. */ TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_TXEN); /* * Check for ack. */ TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); ack = CSR_READ_4(sc, TI_MISC_LOCAL_CTL) & TI_MLC_EE_DIN; TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); return (ack); } /* * Read a byte of data stored in the EEPROM at address 'addr.' * We have to send two address bytes since the EEPROM can hold * more than 256 bytes of data. */ static uint8_t ti_eeprom_getbyte(struct ti_softc *sc, int addr, uint8_t *dest) { int i; uint8_t byte = 0; EEPROM_START; /* * Send write control code to EEPROM. 
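* (Each ti_eeprom_putbyte() call here bit-bangs one byte out MSB first * through TI_MISC_LOCAL_CTL, strobing the clock bit around each data bit, * and then samples the device's acknowledge bit.) 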
*/ if (ti_eeprom_putbyte(sc, EEPROM_CTL_WRITE)) { device_printf(sc->ti_dev, "failed to send write command, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } /* * Send first byte of address of byte we want to read. */ if (ti_eeprom_putbyte(sc, (addr >> 8) & 0xFF)) { device_printf(sc->ti_dev, "failed to send address, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } /* * Send second byte address of byte we want to read. */ if (ti_eeprom_putbyte(sc, addr & 0xFF)) { device_printf(sc->ti_dev, "failed to send address, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } EEPROM_STOP; EEPROM_START; /* * Send read control code to EEPROM. */ if (ti_eeprom_putbyte(sc, EEPROM_CTL_READ)) { device_printf(sc->ti_dev, "failed to send read command, status: %x\n", CSR_READ_4(sc, TI_MISC_LOCAL_CTL)); return (1); } /* * Start reading bits from EEPROM. */ TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_TXEN); for (i = 0x80; i; i >>= 1) { TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); DELAY(1); if (CSR_READ_4(sc, TI_MISC_LOCAL_CTL) & TI_MLC_EE_DIN) byte |= i; TI_CLRBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_EE_CLK); DELAY(1); } EEPROM_STOP; /* * No ACK generated for read, so just return byte. */ *dest = byte; return (0); } /* * Read a sequence of bytes from the EEPROM. */ static int ti_read_eeprom(struct ti_softc *sc, caddr_t dest, int off, int cnt) { int err = 0, i; uint8_t byte = 0; for (i = 0; i < cnt; i++) { err = ti_eeprom_getbyte(sc, off + i, &byte); if (err) break; *(dest + i) = byte; } return (err ? 1 : 0); } /* * NIC memory read function. * Can be used to copy data from NIC local memory. */ static void ti_mem_read(struct ti_softc *sc, uint32_t addr, uint32_t len, void *buf) { int segptr, segsize, cnt; char *ptr; segptr = addr; cnt = len; ptr = buf; while (cnt) { if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, (segptr & ~(TI_WINLEN - 1))); bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, TI_WINDOW + (segptr & (TI_WINLEN - 1)), (uint32_t *)ptr, segsize / 4); ptr += segsize; segptr += segsize; cnt -= segsize; } } /* * NIC memory write function. * Can be used to copy data into NIC local memory. */ static void ti_mem_write(struct ti_softc *sc, uint32_t addr, uint32_t len, void *buf) { int segptr, segsize, cnt; char *ptr; segptr = addr; cnt = len; ptr = buf; while (cnt) { if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, (segptr & ~(TI_WINLEN - 1))); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, TI_WINDOW + (segptr & (TI_WINLEN - 1)), (uint32_t *)ptr, segsize / 4); ptr += segsize; segptr += segsize; cnt -= segsize; } } /* * NIC memory read function. * Can be used to clear a section of NIC local memory. */ static void ti_mem_zero(struct ti_softc *sc, uint32_t addr, uint32_t len) { int segptr, segsize, cnt; segptr = addr; cnt = len; while (cnt) { if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, (segptr & ~(TI_WINLEN - 1))); bus_space_set_region_4(sc->ti_btag, sc->ti_bhandle, TI_WINDOW + (segptr & (TI_WINLEN - 1)), 0, segsize / 4); segptr += segsize; cnt -= segsize; } } static int ti_copy_mem(struct ti_softc *sc, uint32_t tigon_addr, uint32_t len, caddr_t buf, int useraddr, int readdata) { int segptr, segsize, cnt; caddr_t ptr; uint32_t origwin; int resid, segresid; int first_pass; TI_LOCK_ASSERT(sc); /* * At the moment, we don't handle non-aligned cases, we just bail. 
* If this proves to be a problem, it will be fixed. */ if (readdata == 0 && (tigon_addr & 0x3) != 0) { device_printf(sc->ti_dev, "%s: tigon address %#x isn't " "word-aligned\n", __func__, tigon_addr); device_printf(sc->ti_dev, "%s: unaligned writes aren't " "yet supported\n", __func__); return (EINVAL); } segptr = tigon_addr & ~0x3; segresid = tigon_addr - segptr; /* * This is the non-aligned amount left over that we'll need to * copy. */ resid = len & 0x3; /* Add in the left over amount at the front of the buffer */ resid += segresid; cnt = len & ~0x3; /* * If resid + segresid is >= 4, add multiples of 4 to the count and * decrease the residual by that much. */ cnt += resid & ~0x3; resid -= resid & ~0x3; ptr = buf; first_pass = 1; /* * Save the old window base value. */ origwin = CSR_READ_4(sc, TI_WINBASE); while (cnt) { bus_size_t ti_offset; if (cnt < TI_WINLEN) segsize = cnt; else segsize = TI_WINLEN - (segptr % TI_WINLEN); CSR_WRITE_4(sc, TI_WINBASE, (segptr & ~(TI_WINLEN - 1))); ti_offset = TI_WINDOW + (segptr & (TI_WINLEN -1)); if (readdata) { bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, (uint32_t *)sc->ti_membuf, segsize >> 2); if (useraddr) { /* * Yeah, this is a little on the kludgy * side, but at least this code is only * used for debugging. */ ti_bcopy_swap(sc->ti_membuf, sc->ti_membuf2, segsize, TI_SWAP_NTOH); TI_UNLOCK(sc); if (first_pass) { copyout(&sc->ti_membuf2[segresid], ptr, segsize - segresid); first_pass = 0; } else copyout(sc->ti_membuf2, ptr, segsize); TI_LOCK(sc); } else { if (first_pass) { ti_bcopy_swap(sc->ti_membuf, sc->ti_membuf2, segsize, TI_SWAP_NTOH); TI_UNLOCK(sc); bcopy(&sc->ti_membuf2[segresid], ptr, segsize - segresid); TI_LOCK(sc); first_pass = 0; } else ti_bcopy_swap(sc->ti_membuf, ptr, segsize, TI_SWAP_NTOH); } } else { if (useraddr) { TI_UNLOCK(sc); copyin(ptr, sc->ti_membuf2, segsize); TI_LOCK(sc); ti_bcopy_swap(sc->ti_membuf2, sc->ti_membuf, segsize, TI_SWAP_HTON); } else ti_bcopy_swap(ptr, sc->ti_membuf, segsize, TI_SWAP_HTON); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, (uint32_t *)sc->ti_membuf, segsize >> 2); } segptr += segsize; ptr += segsize; cnt -= segsize; } /* * Handle leftover, non-word-aligned bytes. */ if (resid != 0) { uint32_t tmpval, tmpval2; bus_size_t ti_offset; /* * Set the segment pointer. */ CSR_WRITE_4(sc, TI_WINBASE, (segptr & ~(TI_WINLEN - 1))); ti_offset = TI_WINDOW + (segptr & (TI_WINLEN - 1)); /* * First, grab whatever is in our source/destination. * We'll obviously need this for reads, but also for * writes, since we'll be doing read/modify/write. */ bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, &tmpval, 1); /* * Next, translate this from little-endian to big-endian * (at least on i386 boxes). */ tmpval2 = ntohl(tmpval); if (readdata) { /* * If we're reading, just copy the leftover number * of bytes from the host byte order buffer to * the user's buffer. */ if (useraddr) { TI_UNLOCK(sc); copyout(&tmpval2, ptr, resid); TI_LOCK(sc); } else bcopy(&tmpval2, ptr, resid); } else { /* * If we're writing, first copy the bytes to be * written into the network byte order buffer, * leaving the rest of the buffer with whatever was * originally in there. Then, swap the bytes * around into host order and write them out. * * XXX KDM the read side of this has been verified * to work, but the write side of it has not been * verified. So user beware. 
*/ if (useraddr) { TI_UNLOCK(sc); copyin(ptr, &tmpval2, resid); TI_LOCK(sc); } else bcopy(ptr, &tmpval2, resid); tmpval = htonl(tmpval2); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, ti_offset, &tmpval, 1); } } CSR_WRITE_4(sc, TI_WINBASE, origwin); return (0); } static int ti_copy_scratch(struct ti_softc *sc, uint32_t tigon_addr, uint32_t len, caddr_t buf, int useraddr, int readdata, int cpu) { uint32_t segptr; int cnt; uint32_t tmpval, tmpval2; caddr_t ptr; TI_LOCK_ASSERT(sc); /* * At the moment, we don't handle non-aligned cases, we just bail. * If this proves to be a problem, it will be fixed. */ if (tigon_addr & 0x3) { device_printf(sc->ti_dev, "%s: tigon address %#x " "isn't word-aligned\n", __func__, tigon_addr); return (EINVAL); } if (len & 0x3) { device_printf(sc->ti_dev, "%s: transfer length %d " "isn't word-aligned\n", __func__, len); return (EINVAL); } segptr = tigon_addr; cnt = len; ptr = buf; while (cnt) { CSR_WRITE_4(sc, CPU_REG(TI_SRAM_ADDR, cpu), segptr); if (readdata) { tmpval2 = CSR_READ_4(sc, CPU_REG(TI_SRAM_DATA, cpu)); tmpval = ntohl(tmpval2); /* * Note: I've used this debugging interface * extensively with Alteon's 12.3.15 firmware, * compiled with GCC 2.7.2.1 and binutils 2.9.1. * * When you compile the firmware without * optimization, which is necessary sometimes in * order to properly step through it, you sometimes * read out a bogus value of 0xc0017c instead of * whatever was supposed to be in that scratchpad * location. That value is on the stack somewhere, * but I've never been able to figure out what was * causing the problem. * * The address seems to pop up in random places, * often not in the same place on two subsequent * reads. * * In any case, the underlying data doesn't seem * to be affected, just the value read out. * * KDM, 3/7/2000 */ if (tmpval2 == 0xc0017c) device_printf(sc->ti_dev, "found 0xc0017c at " "%#x (tmpval2)\n", segptr); if (tmpval == 0xc0017c) device_printf(sc->ti_dev, "found 0xc0017c at " "%#x (tmpval)\n", segptr); if (useraddr) copyout(&tmpval, ptr, 4); else bcopy(&tmpval, ptr, 4); } else { if (useraddr) copyin(ptr, &tmpval2, 4); else bcopy(ptr, &tmpval2, 4); tmpval = htonl(tmpval2); CSR_WRITE_4(sc, CPU_REG(TI_SRAM_DATA, cpu), tmpval); } cnt -= 4; segptr += 4; ptr += 4; } return (0); } static int ti_bcopy_swap(const void *src, void *dst, size_t len, ti_swap_type swap_type) { const uint8_t *tmpsrc; uint8_t *tmpdst; size_t tmplen; if (len & 0x3) { printf("ti_bcopy_swap: length %zd isn't 32-bit aligned\n", len); return (-1); } tmpsrc = src; tmpdst = dst; tmplen = len; while (tmplen) { if (swap_type == TI_SWAP_NTOH) *(uint32_t *)tmpdst = ntohl(*(const uint32_t *)tmpsrc); else *(uint32_t *)tmpdst = htonl(*(const uint32_t *)tmpsrc); tmpsrc += 4; tmpdst += 4; tmplen -= 4; } return (0); } /* * Load firmware image into the NIC. Check that the firmware revision * is acceptable and see if we want the firmware for the Tigon 1 or * Tigon 2. 
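* A firmware revision mismatch is not fatal to attach: the function only * logs the wanted and found versions and returns without loading anything. 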
*/ static void ti_loadfw(struct ti_softc *sc) { TI_LOCK_ASSERT(sc); switch (sc->ti_hwrev) { case TI_HWREV_TIGON: if (tigonFwReleaseMajor != TI_FIRMWARE_MAJOR || tigonFwReleaseMinor != TI_FIRMWARE_MINOR || tigonFwReleaseFix != TI_FIRMWARE_FIX) { device_printf(sc->ti_dev, "firmware revision mismatch; " "want %d.%d.%d, got %d.%d.%d\n", TI_FIRMWARE_MAJOR, TI_FIRMWARE_MINOR, TI_FIRMWARE_FIX, tigonFwReleaseMajor, tigonFwReleaseMinor, tigonFwReleaseFix); return; } ti_mem_write(sc, tigonFwTextAddr, tigonFwTextLen, tigonFwText); ti_mem_write(sc, tigonFwDataAddr, tigonFwDataLen, tigonFwData); ti_mem_write(sc, tigonFwRodataAddr, tigonFwRodataLen, tigonFwRodata); ti_mem_zero(sc, tigonFwBssAddr, tigonFwBssLen); ti_mem_zero(sc, tigonFwSbssAddr, tigonFwSbssLen); CSR_WRITE_4(sc, TI_CPU_PROGRAM_COUNTER, tigonFwStartAddr); break; case TI_HWREV_TIGON_II: if (tigon2FwReleaseMajor != TI_FIRMWARE_MAJOR || tigon2FwReleaseMinor != TI_FIRMWARE_MINOR || tigon2FwReleaseFix != TI_FIRMWARE_FIX) { device_printf(sc->ti_dev, "firmware revision mismatch; " "want %d.%d.%d, got %d.%d.%d\n", TI_FIRMWARE_MAJOR, TI_FIRMWARE_MINOR, TI_FIRMWARE_FIX, tigon2FwReleaseMajor, tigon2FwReleaseMinor, tigon2FwReleaseFix); return; } ti_mem_write(sc, tigon2FwTextAddr, tigon2FwTextLen, tigon2FwText); ti_mem_write(sc, tigon2FwDataAddr, tigon2FwDataLen, tigon2FwData); ti_mem_write(sc, tigon2FwRodataAddr, tigon2FwRodataLen, tigon2FwRodata); ti_mem_zero(sc, tigon2FwBssAddr, tigon2FwBssLen); ti_mem_zero(sc, tigon2FwSbssAddr, tigon2FwSbssLen); CSR_WRITE_4(sc, TI_CPU_PROGRAM_COUNTER, tigon2FwStartAddr); break; default: device_printf(sc->ti_dev, "can't load firmware: unknown hardware rev\n"); break; } } /* * Send the NIC a command via the command ring. */ static void ti_cmd(struct ti_softc *sc, struct ti_cmd_desc *cmd) { int index; index = sc->ti_cmd_saved_prodidx; CSR_WRITE_4(sc, TI_GCR_CMDRING + (index * 4), *(uint32_t *)(cmd)); TI_INC(index, TI_CMD_RING_CNT); CSR_WRITE_4(sc, TI_MB_CMDPROD_IDX, index); sc->ti_cmd_saved_prodidx = index; } /* * Send the NIC an extended command. The 'len' parameter specifies the * number of command slots to include after the initial command. */ static void ti_cmd_ext(struct ti_softc *sc, struct ti_cmd_desc *cmd, caddr_t arg, int len) { int index; int i; index = sc->ti_cmd_saved_prodidx; CSR_WRITE_4(sc, TI_GCR_CMDRING + (index * 4), *(uint32_t *)(cmd)); TI_INC(index, TI_CMD_RING_CNT); for (i = 0; i < len; i++) { CSR_WRITE_4(sc, TI_GCR_CMDRING + (index * 4), *(uint32_t *)(&arg[i * 4])); TI_INC(index, TI_CMD_RING_CNT); } CSR_WRITE_4(sc, TI_MB_CMDPROD_IDX, index); sc->ti_cmd_saved_prodidx = index; } /* * Handle events that have triggered interrupts. 
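* The firmware posts events to a ring in host memory and advances the * producer index in the status block; the loop below consumes entries until * the saved consumer index catches up, acknowledging each step through * TI_GCR_EVENTCONS_IDX. 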
*/ static void ti_handle_events(struct ti_softc *sc) { struct ti_event_desc *e; if (sc->ti_rdata.ti_event_ring == NULL) return; bus_dmamap_sync(sc->ti_cdata.ti_event_ring_tag, sc->ti_cdata.ti_event_ring_map, BUS_DMASYNC_POSTREAD); while (sc->ti_ev_saved_considx != sc->ti_ev_prodidx.ti_idx) { e = &sc->ti_rdata.ti_event_ring[sc->ti_ev_saved_considx]; switch (TI_EVENT_EVENT(e)) { case TI_EV_LINKSTAT_CHANGED: sc->ti_linkstat = TI_EVENT_CODE(e); if (sc->ti_linkstat == TI_EV_CODE_LINK_UP) { if_link_state_change(sc->ti_ifp, LINK_STATE_UP); sc->ti_ifp->if_baudrate = IF_Mbps(100); if (bootverbose) device_printf(sc->ti_dev, "10/100 link up\n"); } else if (sc->ti_linkstat == TI_EV_CODE_GIG_LINK_UP) { if_link_state_change(sc->ti_ifp, LINK_STATE_UP); sc->ti_ifp->if_baudrate = IF_Gbps(1UL); if (bootverbose) device_printf(sc->ti_dev, "gigabit link up\n"); } else if (sc->ti_linkstat == TI_EV_CODE_LINK_DOWN) { if_link_state_change(sc->ti_ifp, LINK_STATE_DOWN); sc->ti_ifp->if_baudrate = 0; if (bootverbose) device_printf(sc->ti_dev, "link down\n"); } break; case TI_EV_ERROR: if (TI_EVENT_CODE(e) == TI_EV_CODE_ERR_INVAL_CMD) device_printf(sc->ti_dev, "invalid command\n"); else if (TI_EVENT_CODE(e) == TI_EV_CODE_ERR_UNIMP_CMD) device_printf(sc->ti_dev, "unknown command\n"); else if (TI_EVENT_CODE(e) == TI_EV_CODE_ERR_BADCFG) device_printf(sc->ti_dev, "bad config data\n"); break; case TI_EV_FIRMWARE_UP: ti_init2(sc); break; case TI_EV_STATS_UPDATED: case TI_EV_RESET_JUMBO_RING: case TI_EV_MCAST_UPDATED: /* Who cares. */ break; default: device_printf(sc->ti_dev, "unknown event: %d\n", TI_EVENT_EVENT(e)); break; } /* Advance the consumer index. */ TI_INC(sc->ti_ev_saved_considx, TI_EVENT_RING_CNT); CSR_WRITE_4(sc, TI_GCR_EVENTCONS_IDX, sc->ti_ev_saved_considx); } bus_dmamap_sync(sc->ti_cdata.ti_event_ring_tag, sc->ti_cdata.ti_event_ring_map, BUS_DMASYNC_PREREAD); } struct ti_dmamap_arg { bus_addr_t ti_busaddr; }; static void ti_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct ti_dmamap_arg *ctx; if (error) return; KASSERT(nseg == 1, ("%s: %d segments returned!", __func__, nseg)); ctx = arg; ctx->ti_busaddr = segs->ds_addr; } static int ti_dma_ring_alloc(struct ti_softc *sc, bus_size_t alignment, bus_size_t maxsize, bus_dma_tag_t *tag, uint8_t **ring, bus_dmamap_t *map, bus_addr_t *paddr, const char *msg) { struct ti_dmamap_arg ctx; int error; error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, alignment, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, maxsize, 1, maxsize, 0, NULL, NULL, tag); if (error != 0) { device_printf(sc->ti_dev, "could not create %s dma tag\n", msg); return (error); } /* Allocate DMA'able memory for ring. */ error = bus_dmamem_alloc(*tag, (void **)ring, BUS_DMA_NOWAIT | BUS_DMA_ZERO | BUS_DMA_COHERENT, map); if (error != 0) { device_printf(sc->ti_dev, "could not allocate DMA'able memory for %s\n", msg); return (error); } /* Load the address of the ring. 
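* ti_dma_map_addr() is the bus_dmamap_load() callback; with BUS_DMA_NOWAIT * the load is never deferred, so on success ctx.ti_busaddr holds the ring's * bus address as soon as bus_dmamap_load() returns. 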
*/ ctx.ti_busaddr = 0; error = bus_dmamap_load(*tag, *map, *ring, maxsize, ti_dma_map_addr, &ctx, BUS_DMA_NOWAIT); if (error != 0) { device_printf(sc->ti_dev, "could not load DMA'able memory for %s\n", msg); return (error); } *paddr = ctx.ti_busaddr; return (0); } static void ti_dma_ring_free(struct ti_softc *sc, bus_dma_tag_t *tag, uint8_t **ring, bus_dmamap_t map, bus_addr_t *paddr) { if (*paddr != 0) { bus_dmamap_unload(*tag, map); *paddr = 0; } if (*ring != NULL) { bus_dmamem_free(*tag, *ring, map); *ring = NULL; } if (*tag) { bus_dma_tag_destroy(*tag); *tag = NULL; } } static int ti_dma_alloc(struct ti_softc *sc) { bus_addr_t lowaddr; int i, error; lowaddr = BUS_SPACE_MAXADDR; if (sc->ti_dac == 0) lowaddr = BUS_SPACE_MAXADDR_32BIT; error = bus_dma_tag_create(bus_get_dma_tag(sc->ti_dev), 1, 0, lowaddr, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, 0, NULL, NULL, &sc->ti_cdata.ti_parent_tag); if (error != 0) { device_printf(sc->ti_dev, "could not allocate parent dma tag\n"); return (ENOMEM); } error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, sizeof(struct ti_gib), &sc->ti_cdata.ti_gib_tag, (uint8_t **)&sc->ti_rdata.ti_info, &sc->ti_cdata.ti_gib_map, &sc->ti_rdata.ti_info_paddr, "GIB"); if (error) return (error); /* Producer/consumer status */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, sizeof(struct ti_status), &sc->ti_cdata.ti_status_tag, (uint8_t **)&sc->ti_rdata.ti_status, &sc->ti_cdata.ti_status_map, &sc->ti_rdata.ti_status_paddr, "event ring"); if (error) return (error); /* Event ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_EVENT_RING_SZ, &sc->ti_cdata.ti_event_ring_tag, (uint8_t **)&sc->ti_rdata.ti_event_ring, &sc->ti_cdata.ti_event_ring_map, &sc->ti_rdata.ti_event_ring_paddr, "event ring"); if (error) return (error); /* Command ring lives in shared memory so no need to create DMA area. */ /* Standard RX ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_STD_RX_RING_SZ, &sc->ti_cdata.ti_rx_std_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_std_ring, &sc->ti_cdata.ti_rx_std_ring_map, &sc->ti_rdata.ti_rx_std_ring_paddr, "RX ring"); if (error) return (error); /* Jumbo RX ring */ error = ti_dma_ring_alloc(sc, TI_JUMBO_RING_ALIGN, TI_JUMBO_RX_RING_SZ, &sc->ti_cdata.ti_rx_jumbo_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_jumbo_ring, &sc->ti_cdata.ti_rx_jumbo_ring_map, &sc->ti_rdata.ti_rx_jumbo_ring_paddr, "jumbo RX ring"); if (error) return (error); /* RX return ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_RX_RETURN_RING_SZ, &sc->ti_cdata.ti_rx_return_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_return_ring, &sc->ti_cdata.ti_rx_return_ring_map, &sc->ti_rdata.ti_rx_return_ring_paddr, "RX return ring"); if (error) return (error); /* Create DMA tag for standard RX mbufs. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, 0, NULL, NULL, &sc->ti_cdata.ti_rx_std_tag); if (error) { device_printf(sc->ti_dev, "could not allocate RX dma tag\n"); return (error); } /* Create DMA tag for jumbo RX mbufs. */ #ifdef TI_SF_BUF_JUMBO /* * The VM system will take care of providing aligned pages. Alignment * is set to 1 here so that busdma resources won't be wasted. 
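* The tag below therefore allows up to four page-sized segments per jumbo * buffer (maxsize PAGE_SIZE * 4, nsegments 4, maxsegsz PAGE_SIZE), while * the non-TI_SF_BUF_JUMBO case uses a single MJUM9BYTES cluster instead. 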
*/ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, PAGE_SIZE * 4, 4, PAGE_SIZE, 0, NULL, NULL, &sc->ti_cdata.ti_rx_jumbo_tag); #else error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM9BYTES, 1, MJUM9BYTES, 0, NULL, NULL, &sc->ti_cdata.ti_rx_jumbo_tag); #endif if (error) { device_printf(sc->ti_dev, "could not allocate jumbo RX dma tag\n"); return (error); } /* Create DMA tag for TX mbufs. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES * TI_MAXTXSEGS, TI_MAXTXSEGS, MCLBYTES, 0, NULL, NULL, &sc->ti_cdata.ti_tx_tag); if (error) { device_printf(sc->ti_dev, "could not allocate TX dma tag\n"); return (ENOMEM); } /* Create DMA maps for RX buffers. */ for (i = 0; i < TI_STD_RX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_rx_std_tag, 0, &sc->ti_cdata.ti_rx_std_maps[i]); if (error) { device_printf(sc->ti_dev, "could not create DMA map for RX\n"); return (error); } } error = bus_dmamap_create(sc->ti_cdata.ti_rx_std_tag, 0, &sc->ti_cdata.ti_rx_std_sparemap); if (error) { device_printf(sc->ti_dev, "could not create spare DMA map for RX\n"); return (error); } /* Create DMA maps for jumbo RX buffers. */ for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_rx_jumbo_tag, 0, &sc->ti_cdata.ti_rx_jumbo_maps[i]); if (error) { device_printf(sc->ti_dev, "could not create DMA map for jumbo RX\n"); return (error); } } error = bus_dmamap_create(sc->ti_cdata.ti_rx_jumbo_tag, 0, &sc->ti_cdata.ti_rx_jumbo_sparemap); if (error) { device_printf(sc->ti_dev, "could not create spare DMA map for jumbo RX\n"); return (error); } /* Create DMA maps for TX buffers. */ for (i = 0; i < TI_TX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_tx_tag, 0, &sc->ti_cdata.ti_txdesc[i].tx_dmamap); if (error) { device_printf(sc->ti_dev, "could not create DMA map for TX\n"); return (ENOMEM); } } /* Mini ring and TX ring is not available on Tigon 1. */ if (sc->ti_hwrev == TI_HWREV_TIGON) return (0); /* TX ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_TX_RING_SZ, &sc->ti_cdata.ti_tx_ring_tag, (uint8_t **)&sc->ti_rdata.ti_tx_ring, &sc->ti_cdata.ti_tx_ring_map, &sc->ti_rdata.ti_tx_ring_paddr, "TX ring"); if (error) return (error); /* Mini RX ring */ error = ti_dma_ring_alloc(sc, TI_RING_ALIGN, TI_MINI_RX_RING_SZ, &sc->ti_cdata.ti_rx_mini_ring_tag, (uint8_t **)&sc->ti_rdata.ti_rx_mini_ring, &sc->ti_cdata.ti_rx_mini_ring_map, &sc->ti_rdata.ti_rx_mini_ring_paddr, "mini RX ring"); if (error) return (error); /* Create DMA tag for mini RX mbufs. */ error = bus_dma_tag_create(sc->ti_cdata.ti_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MHLEN, 1, MHLEN, 0, NULL, NULL, &sc->ti_cdata.ti_rx_mini_tag); if (error) { device_printf(sc->ti_dev, "could not allocate mini RX dma tag\n"); return (error); } /* Create DMA maps for mini RX buffers. 
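* As with the other rings, there is one map per slot plus a spare: a new * mbuf is loaded into the spare map first, so a failed load leaves the old * buffer and its mapping intact; on success the two maps are swapped. 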
*/ for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { error = bus_dmamap_create(sc->ti_cdata.ti_rx_mini_tag, 0, &sc->ti_cdata.ti_rx_mini_maps[i]); if (error) { device_printf(sc->ti_dev, "could not create DMA map for mini RX\n"); return (error); } } error = bus_dmamap_create(sc->ti_cdata.ti_rx_mini_tag, 0, &sc->ti_cdata.ti_rx_mini_sparemap); if (error) { device_printf(sc->ti_dev, "could not create spare DMA map for mini RX\n"); return (error); } return (0); } static void ti_dma_free(struct ti_softc *sc) { int i; /* Destroy DMA maps for RX buffers. */ for (i = 0; i < TI_STD_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_std_maps[i]) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i]); sc->ti_cdata.ti_rx_std_maps[i] = NULL; } } if (sc->ti_cdata.ti_rx_std_sparemap) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_sparemap); sc->ti_cdata.ti_rx_std_sparemap = NULL; } if (sc->ti_cdata.ti_rx_std_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_rx_std_tag); sc->ti_cdata.ti_rx_std_tag = NULL; } /* Destroy DMA maps for jumbo RX buffers. */ for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_jumbo_maps[i]) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i]); sc->ti_cdata.ti_rx_jumbo_maps[i] = NULL; } } if (sc->ti_cdata.ti_rx_jumbo_sparemap) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_sparemap); sc->ti_cdata.ti_rx_jumbo_sparemap = NULL; } if (sc->ti_cdata.ti_rx_jumbo_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_rx_jumbo_tag); sc->ti_cdata.ti_rx_jumbo_tag = NULL; } /* Destroy DMA maps for mini RX buffers. */ for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_mini_maps[i]) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i]); sc->ti_cdata.ti_rx_mini_maps[i] = NULL; } } if (sc->ti_cdata.ti_rx_mini_sparemap) { bus_dmamap_destroy(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_sparemap); sc->ti_cdata.ti_rx_mini_sparemap = NULL; } if (sc->ti_cdata.ti_rx_mini_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_rx_mini_tag); sc->ti_cdata.ti_rx_mini_tag = NULL; } /* Destroy DMA maps for TX buffers. */ for (i = 0; i < TI_TX_RING_CNT; i++) { if (sc->ti_cdata.ti_txdesc[i].tx_dmamap) { bus_dmamap_destroy(sc->ti_cdata.ti_tx_tag, sc->ti_cdata.ti_txdesc[i].tx_dmamap); sc->ti_cdata.ti_txdesc[i].tx_dmamap = NULL; } } if (sc->ti_cdata.ti_tx_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_tx_tag); sc->ti_cdata.ti_tx_tag = NULL; } /* Destroy standard RX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_std_ring_tag, (void *)&sc->ti_rdata.ti_rx_std_ring, sc->ti_cdata.ti_rx_std_ring_map, &sc->ti_rdata.ti_rx_std_ring_paddr); /* Destroy jumbo RX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_jumbo_ring_tag, (void *)&sc->ti_rdata.ti_rx_jumbo_ring, sc->ti_cdata.ti_rx_jumbo_ring_map, &sc->ti_rdata.ti_rx_jumbo_ring_paddr); /* Destroy mini RX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_mini_ring_tag, (void *)&sc->ti_rdata.ti_rx_mini_ring, sc->ti_cdata.ti_rx_mini_ring_map, &sc->ti_rdata.ti_rx_mini_ring_paddr); /* Destroy RX return ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_rx_return_ring_tag, (void *)&sc->ti_rdata.ti_rx_return_ring, sc->ti_cdata.ti_rx_return_ring_map, &sc->ti_rdata.ti_rx_return_ring_paddr); /* Destroy TX ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_tx_ring_tag, (void *)&sc->ti_rdata.ti_tx_ring, sc->ti_cdata.ti_tx_ring_map, &sc->ti_rdata.ti_tx_ring_paddr); /* Destroy status block. 
*/ ti_dma_ring_free(sc, &sc->ti_cdata.ti_status_tag, (void *)&sc->ti_rdata.ti_status, sc->ti_cdata.ti_status_map, &sc->ti_rdata.ti_status_paddr); /* Destroy event ring. */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_event_ring_tag, (void *)&sc->ti_rdata.ti_event_ring, sc->ti_cdata.ti_event_ring_map, &sc->ti_rdata.ti_event_ring_paddr); /* Destroy GIB */ ti_dma_ring_free(sc, &sc->ti_cdata.ti_gib_tag, (void *)&sc->ti_rdata.ti_info, sc->ti_cdata.ti_gib_map, &sc->ti_rdata.ti_info_paddr); /* Destroy the parent tag. */ if (sc->ti_cdata.ti_parent_tag) { bus_dma_tag_destroy(sc->ti_cdata.ti_parent_tag); sc->ti_cdata.ti_parent_tag = NULL; } } /* * Intialize a standard receive ring descriptor. */ static int ti_newbuf_std(struct ti_softc *sc, int i) { bus_dmamap_t map; bus_dma_segment_t segs[1]; struct mbuf *m; struct ti_rx_desc *r; int error, nsegs; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MCLBYTES; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_sparemap, m, segs, &nsegs, 0); if (error != 0) { m_freem(m); return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); if (sc->ti_cdata.ti_rx_std_chain[i] != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i], BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i]); } map = sc->ti_cdata.ti_rx_std_maps[i]; sc->ti_cdata.ti_rx_std_maps[i] = sc->ti_cdata.ti_rx_std_sparemap; sc->ti_cdata.ti_rx_std_sparemap = map; sc->ti_cdata.ti_rx_std_chain[i] = m; r = &sc->ti_rdata.ti_rx_std_ring[i]; ti_hostaddr64(&r->ti_addr, segs[0].ds_addr); r->ti_len = segs[0].ds_len; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = 0; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; bus_dmamap_sync(sc->ti_cdata.ti_rx_std_tag, sc->ti_cdata.ti_rx_std_maps[i], BUS_DMASYNC_PREREAD); return (0); } /* * Intialize a mini receive ring descriptor. This only applies to * the Tigon 2. 
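* Mini-ring receive buffers are plain header mbufs (MHLEN bytes, no * cluster), sized for small frames. 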
*/ static int ti_newbuf_mini(struct ti_softc *sc, int i) { bus_dmamap_t map; bus_dma_segment_t segs[1]; struct mbuf *m; struct ti_rx_desc *r; int error, nsegs; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MHLEN; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_sparemap, m, segs, &nsegs, 0); if (error != 0) { m_freem(m); return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); if (sc->ti_cdata.ti_rx_mini_chain[i] != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i], BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i]); } map = sc->ti_cdata.ti_rx_mini_maps[i]; sc->ti_cdata.ti_rx_mini_maps[i] = sc->ti_cdata.ti_rx_mini_sparemap; sc->ti_cdata.ti_rx_mini_sparemap = map; sc->ti_cdata.ti_rx_mini_chain[i] = m; r = &sc->ti_rdata.ti_rx_mini_ring[i]; ti_hostaddr64(&r->ti_addr, segs[0].ds_addr); r->ti_len = segs[0].ds_len; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = TI_BDFLAG_MINI_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_tag, sc->ti_cdata.ti_rx_mini_maps[i], BUS_DMASYNC_PREREAD); return (0); } #ifndef TI_SF_BUF_JUMBO /* * Initialize a jumbo receive ring descriptor. This allocates * a jumbo buffer from the pool managed internally by the driver. */ static int ti_newbuf_jumbo(struct ti_softc *sc, int i, struct mbuf *dummy) { bus_dmamap_t map; bus_dma_segment_t segs[1]; struct mbuf *m; struct ti_rx_desc *r; int error, nsegs; (void)dummy; m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MJUM9BYTES; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_sparemap, m, segs, &nsegs, 0); if (error != 0) { m_freem(m); return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); if (sc->ti_cdata.ti_rx_jumbo_chain[i] != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i], BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i]); } map = sc->ti_cdata.ti_rx_jumbo_maps[i]; sc->ti_cdata.ti_rx_jumbo_maps[i] = sc->ti_cdata.ti_rx_jumbo_sparemap; sc->ti_cdata.ti_rx_jumbo_sparemap = map; sc->ti_cdata.ti_rx_jumbo_chain[i] = m; r = &sc->ti_rdata.ti_rx_jumbo_ring[i]; ti_hostaddr64(&r->ti_addr, segs[0].ds_addr); r->ti_len = segs[0].ds_len; r->ti_type = TI_BDTYPE_RECV_JUMBO_BD; r->ti_flags = TI_BDFLAG_JUMBO_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, sc->ti_cdata.ti_rx_jumbo_maps[i], BUS_DMASYNC_PREREAD); return (0); } #else #if (PAGE_SIZE == 4096) #define NPAYLOAD 2 #else #define NPAYLOAD 1 #endif #define TCP_HDR_LEN (52 + sizeof(struct ether_header)) #define UDP_HDR_LEN (28 + sizeof(struct ether_header)) #define NFS_HDR_LEN (UDP_HDR_LEN) static int HDR_LEN = TCP_HDR_LEN; /* * Initialize a jumbo receive ring descriptor. This allocates * a jumbo buffer from the pool managed internally by the driver. 
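* In this sf_buf-backed variant a received frame is split across a small * header mbuf, NPAYLOAD page-backed mbufs and one MCLBYTES cluster, * matching the up to four segments of an extended receive BD. 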
*/ static int ti_newbuf_jumbo(struct ti_softc *sc, int idx, struct mbuf *m_old) { bus_dmamap_t map; struct mbuf *cur, *m_new = NULL; struct mbuf *m[3] = {NULL, NULL, NULL}; struct ti_rx_desc_ext *r; vm_page_t frame; /* 1 extra buf to make nobufs easy*/ struct sf_buf *sf[3] = {NULL, NULL, NULL}; int i; bus_dma_segment_t segs[4]; int nsegs; if (m_old != NULL) { m_new = m_old; cur = m_old->m_next; for (i = 0; i <= NPAYLOAD; i++){ m[i] = cur; cur = cur->m_next; } } else { /* Allocate the mbufs. */ MGETHDR(m_new, M_NOWAIT, MT_DATA); if (m_new == NULL) { device_printf(sc->ti_dev, "mbuf allocation failed " "-- packet dropped!\n"); goto nobufs; } MGET(m[NPAYLOAD], M_NOWAIT, MT_DATA); if (m[NPAYLOAD] == NULL) { device_printf(sc->ti_dev, "cluster mbuf allocation " "failed -- packet dropped!\n"); goto nobufs; } if (!(MCLGET(m[NPAYLOAD], M_NOWAIT))) { device_printf(sc->ti_dev, "mbuf allocation failed " "-- packet dropped!\n"); goto nobufs; } m[NPAYLOAD]->m_len = MCLBYTES; for (i = 0; i < NPAYLOAD; i++){ MGET(m[i], M_NOWAIT, MT_DATA); if (m[i] == NULL) { device_printf(sc->ti_dev, "mbuf allocation " "failed -- packet dropped!\n"); goto nobufs; } frame = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (frame == NULL) { device_printf(sc->ti_dev, "buffer allocation " "failed -- packet dropped!\n"); printf(" index %d page %d\n", idx, i); goto nobufs; } sf[i] = sf_buf_alloc(frame, SFB_NOWAIT); if (sf[i] == NULL) { vm_page_unwire(frame, PQ_INACTIVE); vm_page_free(frame); device_printf(sc->ti_dev, "buffer allocation " "failed -- packet dropped!\n"); printf(" index %d page %d\n", idx, i); goto nobufs; } } for (i = 0; i < NPAYLOAD; i++){ /* Attach the buffer to the mbuf. */ m[i]->m_data = (void *)sf_buf_kva(sf[i]); m[i]->m_len = PAGE_SIZE; MEXTADD(m[i], sf_buf_kva(sf[i]), PAGE_SIZE, - sf_buf_mext, (void*)sf_buf_kva(sf[i]), sf[i], + sf_mext_free, (void*)sf_buf_kva(sf[i]), sf[i], 0, EXT_DISPOSABLE); m[i]->m_next = m[i+1]; } /* link the buffers to the header */ m_new->m_next = m[0]; m_new->m_data += ETHER_ALIGN; if (sc->ti_hdrsplit) m_new->m_len = MHLEN - ETHER_ALIGN; else m_new->m_len = HDR_LEN; m_new->m_pkthdr.len = NPAYLOAD * PAGE_SIZE + m_new->m_len; } /* Set up the descriptor. */ r = &sc->ti_rdata.ti_rx_jumbo_ring[idx]; sc->ti_cdata.ti_rx_jumbo_chain[idx] = m_new; map = sc->ti_cdata.ti_rx_jumbo_maps[i]; if (bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_rx_jumbo_tag, map, m_new, segs, &nsegs, 0)) return (ENOBUFS); if ((nsegs < 1) || (nsegs > 4)) return (ENOBUFS); ti_hostaddr64(&r->ti_addr0, segs[0].ds_addr); r->ti_len0 = m_new->m_len; ti_hostaddr64(&r->ti_addr1, segs[1].ds_addr); r->ti_len1 = PAGE_SIZE; ti_hostaddr64(&r->ti_addr2, segs[2].ds_addr); r->ti_len2 = m[1]->m_ext.ext_size; /* could be PAGE_SIZE or MCLBYTES */ if (PAGE_SIZE == 4096) { ti_hostaddr64(&r->ti_addr3, segs[3].ds_addr); r->ti_len3 = MCLBYTES; } else { r->ti_len3 = 0; } r->ti_type = TI_BDTYPE_RECV_JUMBO_BD; r->ti_flags = TI_BDFLAG_JUMBO_RING|TI_RCB_FLAG_USE_EXT_RX_BD; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM|TI_BDFLAG_IP_CKSUM; r->ti_idx = idx; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, map, BUS_DMASYNC_PREREAD); return (0); nobufs: /* * Warning! : * This can only be called before the mbufs are strung together. * If the mbufs are strung together, m_freem() will free the chain, * so that the later mbufs will be freed multiple times. 
*/ if (m_new) m_freem(m_new); for (i = 0; i < 3; i++) { if (m[i]) m_freem(m[i]); if (sf[i]) - sf_buf_mext((void *)sf_buf_kva(sf[i]), sf[i]); + sf_mext_free((void *)sf_buf_kva(sf[i]), sf[i]); } return (ENOBUFS); } #endif /* * The standard receive ring has 512 entries in it. At 2K per mbuf cluster, * that's 1MB or memory, which is a lot. For now, we fill only the first * 256 ring entries and hope that our CPU is fast enough to keep up with * the NIC. */ static int ti_init_rx_ring_std(struct ti_softc *sc) { int i; struct ti_cmd_desc cmd; for (i = 0; i < TI_STD_RX_RING_CNT; i++) { if (ti_newbuf_std(sc, i) != 0) return (ENOBUFS); }; sc->ti_std = TI_STD_RX_RING_CNT - 1; TI_UPDATE_STDPROD(sc, TI_STD_RX_RING_CNT - 1); return (0); } static void ti_free_rx_ring_std(struct ti_softc *sc) { bus_dmamap_t map; int i; for (i = 0; i < TI_STD_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_std_chain[i] != NULL) { map = sc->ti_cdata.ti_rx_std_maps[i]; bus_dmamap_sync(sc->ti_cdata.ti_rx_std_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_std_tag, map); m_freem(sc->ti_cdata.ti_rx_std_chain[i]); sc->ti_cdata.ti_rx_std_chain[i] = NULL; } } bzero(sc->ti_rdata.ti_rx_std_ring, TI_STD_RX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_rx_std_ring_tag, sc->ti_cdata.ti_rx_std_ring_map, BUS_DMASYNC_PREWRITE); } static int ti_init_rx_ring_jumbo(struct ti_softc *sc) { struct ti_cmd_desc cmd; int i; for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { if (ti_newbuf_jumbo(sc, i, NULL) != 0) return (ENOBUFS); }; sc->ti_jumbo = TI_JUMBO_RX_RING_CNT - 1; TI_UPDATE_JUMBOPROD(sc, TI_JUMBO_RX_RING_CNT - 1); return (0); } static void ti_free_rx_ring_jumbo(struct ti_softc *sc) { bus_dmamap_t map; int i; for (i = 0; i < TI_JUMBO_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_jumbo_chain[i] != NULL) { map = sc->ti_cdata.ti_rx_jumbo_maps[i]; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_jumbo_tag, map); m_freem(sc->ti_cdata.ti_rx_jumbo_chain[i]); sc->ti_cdata.ti_rx_jumbo_chain[i] = NULL; } } bzero(sc->ti_rdata.ti_rx_jumbo_ring, TI_JUMBO_RX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_ring_tag, sc->ti_cdata.ti_rx_jumbo_ring_map, BUS_DMASYNC_PREWRITE); } static int ti_init_rx_ring_mini(struct ti_softc *sc) { int i; for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { if (ti_newbuf_mini(sc, i) != 0) return (ENOBUFS); }; sc->ti_mini = TI_MINI_RX_RING_CNT - 1; TI_UPDATE_MINIPROD(sc, TI_MINI_RX_RING_CNT - 1); return (0); } static void ti_free_rx_ring_mini(struct ti_softc *sc) { bus_dmamap_t map; int i; if (sc->ti_rdata.ti_rx_mini_ring == NULL) return; for (i = 0; i < TI_MINI_RX_RING_CNT; i++) { if (sc->ti_cdata.ti_rx_mini_chain[i] != NULL) { map = sc->ti_cdata.ti_rx_mini_maps[i]; bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_mini_tag, map); m_freem(sc->ti_cdata.ti_rx_mini_chain[i]); sc->ti_cdata.ti_rx_mini_chain[i] = NULL; } } bzero(sc->ti_rdata.ti_rx_mini_ring, TI_MINI_RX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_ring_tag, sc->ti_cdata.ti_rx_mini_ring_map, BUS_DMASYNC_PREWRITE); } static void ti_free_tx_ring(struct ti_softc *sc) { struct ti_txdesc *txd; int i; if (sc->ti_rdata.ti_tx_ring == NULL) return; for (i = 0; i < TI_TX_RING_CNT; i++) { txd = &sc->ti_cdata.ti_txdesc[i]; if (txd->tx_m != NULL) { bus_dmamap_sync(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap); m_freem(txd->tx_m); txd->tx_m = NULL; } } bzero(sc->ti_rdata.ti_tx_ring, 
TI_TX_RING_SZ); bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_PREWRITE); } static int ti_init_tx_ring(struct ti_softc *sc) { struct ti_txdesc *txd; int i; STAILQ_INIT(&sc->ti_cdata.ti_txfreeq); STAILQ_INIT(&sc->ti_cdata.ti_txbusyq); for (i = 0; i < TI_TX_RING_CNT; i++) { txd = &sc->ti_cdata.ti_txdesc[i]; STAILQ_INSERT_TAIL(&sc->ti_cdata.ti_txfreeq, txd, tx_q); } sc->ti_txcnt = 0; sc->ti_tx_saved_considx = 0; sc->ti_tx_saved_prodidx = 0; CSR_WRITE_4(sc, TI_MB_SENDPROD_IDX, 0); return (0); } /* * The Tigon 2 firmware has a new way to add/delete multicast addresses, * but we have to support the old way too so that Tigon 1 cards will * work. */ static void ti_add_mcast(struct ti_softc *sc, struct ether_addr *addr) { struct ti_cmd_desc cmd; uint16_t *m; uint32_t ext[2] = {0, 0}; m = (uint16_t *)&addr->octet[0]; switch (sc->ti_hwrev) { case TI_HWREV_TIGON: CSR_WRITE_4(sc, TI_GCR_MAR0, htons(m[0])); CSR_WRITE_4(sc, TI_GCR_MAR1, (htons(m[1]) << 16) | htons(m[2])); TI_DO_CMD(TI_CMD_ADD_MCAST_ADDR, 0, 0); break; case TI_HWREV_TIGON_II: ext[0] = htons(m[0]); ext[1] = (htons(m[1]) << 16) | htons(m[2]); TI_DO_CMD_EXT(TI_CMD_EXT_ADD_MCAST, 0, 0, (caddr_t)&ext, 2); break; default: device_printf(sc->ti_dev, "unknown hwrev\n"); break; } } static void ti_del_mcast(struct ti_softc *sc, struct ether_addr *addr) { struct ti_cmd_desc cmd; uint16_t *m; uint32_t ext[2] = {0, 0}; m = (uint16_t *)&addr->octet[0]; switch (sc->ti_hwrev) { case TI_HWREV_TIGON: CSR_WRITE_4(sc, TI_GCR_MAR0, htons(m[0])); CSR_WRITE_4(sc, TI_GCR_MAR1, (htons(m[1]) << 16) | htons(m[2])); TI_DO_CMD(TI_CMD_DEL_MCAST_ADDR, 0, 0); break; case TI_HWREV_TIGON_II: ext[0] = htons(m[0]); ext[1] = (htons(m[1]) << 16) | htons(m[2]); TI_DO_CMD_EXT(TI_CMD_EXT_DEL_MCAST, 0, 0, (caddr_t)&ext, 2); break; default: device_printf(sc->ti_dev, "unknown hwrev\n"); break; } } /* * Configure the Tigon's multicast address filter. * * The actual multicast table management is a bit of a pain, thanks to * slight brain damage on the part of both Alteon and us. With our * multicast code, we are only alerted when the multicast address table * changes and at that point we only have the current list of addresses: * we only know the current state, not the previous state, so we don't * actually know what addresses were removed or added. The firmware has * state, but we can't get our grubby mitts on it, and there is no 'delete * all multicast addresses' command. Hence, we have to maintain our own * state so we know what addresses have been programmed into the NIC at * any given time. */ static void ti_setmulti(struct ti_softc *sc) { struct ifnet *ifp; struct ifmultiaddr *ifma; struct ti_cmd_desc cmd; struct ti_mc_entry *mc; uint32_t intrs; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; if (ifp->if_flags & IFF_ALLMULTI) { TI_DO_CMD(TI_CMD_SET_ALLMULTI, TI_CMD_CODE_ALLMULTI_ENB, 0); return; } else { TI_DO_CMD(TI_CMD_SET_ALLMULTI, TI_CMD_CODE_ALLMULTI_DIS, 0); } /* Disable interrupts. */ intrs = CSR_READ_4(sc, TI_MB_HOSTINTR); CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); /* First, zot all the existing filters. */ while (SLIST_FIRST(&sc->ti_mc_listhead) != NULL) { mc = SLIST_FIRST(&sc->ti_mc_listhead); ti_del_mcast(sc, &mc->mc_addr); SLIST_REMOVE_HEAD(&sc->ti_mc_listhead, mc_entries); free(mc, M_DEVBUF); } /* Now program new ones.
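* Each AF_LINK address found on the interface below is copied into a freshly allocated ti_mc_entry and handed to ti_add_mcast(), so the driver's private list stays in sync with what the firmware has been told.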
*/ if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct ti_mc_entry), M_DEVBUF, M_NOWAIT); if (mc == NULL) { device_printf(sc->ti_dev, "no memory for mcast filter entry\n"); continue; } bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), (char *)&mc->mc_addr, ETHER_ADDR_LEN); SLIST_INSERT_HEAD(&sc->ti_mc_listhead, mc, mc_entries); ti_add_mcast(sc, &mc->mc_addr); } if_maddr_runlock(ifp); /* Re-enable interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, intrs); } /* * Check to see if the BIOS has configured us for a 64 bit slot when * we aren't actually in one. If we detect this condition, we can work * around it on the Tigon 2 by setting a bit in the PCI state register, * but for the Tigon 1 we must give up and abort the interface attach. */ static int ti_64bitslot_war(struct ti_softc *sc) { if (!(CSR_READ_4(sc, TI_PCI_STATE) & TI_PCISTATE_32BIT_BUS)) { CSR_WRITE_4(sc, 0x600, 0); CSR_WRITE_4(sc, 0x604, 0); CSR_WRITE_4(sc, 0x600, 0x5555AAAA); if (CSR_READ_4(sc, 0x604) == 0x5555AAAA) { if (sc->ti_hwrev == TI_HWREV_TIGON) return (EINVAL); else { TI_SETBIT(sc, TI_PCI_STATE, TI_PCISTATE_32BIT_BUS); return (0); } } } return (0); } /* * Do endian, PCI and DMA initialization. Also check the on-board ROM * self-test results. */ static int ti_chipinit(struct ti_softc *sc) { uint32_t cacheline; uint32_t pci_writemax = 0; uint32_t hdrsplit; /* Initialize link to down state. */ sc->ti_linkstat = TI_EV_CODE_LINK_DOWN; /* Set endianness before we access any non-PCI registers. */ #if 0 && BYTE_ORDER == BIG_ENDIAN CSR_WRITE_4(sc, TI_MISC_HOST_CTL, TI_MHC_BIGENDIAN_INIT | (TI_MHC_BIGENDIAN_INIT << 24)); #else CSR_WRITE_4(sc, TI_MISC_HOST_CTL, TI_MHC_LITTLEENDIAN_INIT | (TI_MHC_LITTLEENDIAN_INIT << 24)); #endif /* Check the ROM failed bit to see if self-tests passed. */ if (CSR_READ_4(sc, TI_CPU_STATE) & TI_CPUSTATE_ROMFAIL) { device_printf(sc->ti_dev, "board self-diagnostics failed!\n"); return (ENODEV); } /* Halt the CPU. */ TI_SETBIT(sc, TI_CPU_STATE, TI_CPUSTATE_HALT); /* Figure out the hardware revision. */ switch (CSR_READ_4(sc, TI_MISC_HOST_CTL) & TI_MHC_CHIP_REV_MASK) { case TI_REV_TIGON_I: sc->ti_hwrev = TI_HWREV_TIGON; break; case TI_REV_TIGON_II: sc->ti_hwrev = TI_HWREV_TIGON_II; break; default: device_printf(sc->ti_dev, "unsupported chip revision\n"); return (ENODEV); } /* Do special setup for Tigon 2. */ if (sc->ti_hwrev == TI_HWREV_TIGON_II) { TI_SETBIT(sc, TI_CPU_CTL_B, TI_CPUSTATE_HALT); TI_SETBIT(sc, TI_MISC_LOCAL_CTL, TI_MLC_SRAM_BANK_512K); TI_SETBIT(sc, TI_MISC_CONF, TI_MCR_SRAM_SYNCHRONOUS); } /* * We don't have firmware source for the Tigon 1, so Tigon 1 boards * can't do header splitting. */ #ifdef TI_JUMBO_HDRSPLIT if (sc->ti_hwrev != TI_HWREV_TIGON) sc->ti_hdrsplit = 1; else device_printf(sc->ti_dev, "can't do header splitting on a Tigon I board\n"); #endif /* TI_JUMBO_HDRSPLIT */ /* Set up the PCI state register. */ CSR_WRITE_4(sc, TI_PCI_STATE, TI_PCI_READ_CMD|TI_PCI_WRITE_CMD); if (sc->ti_hwrev == TI_HWREV_TIGON_II) { TI_SETBIT(sc, TI_PCI_STATE, TI_PCISTATE_USE_MEM_RD_MULT); } /* Clear the read/write max DMA parameters. */ TI_CLRBIT(sc, TI_PCI_STATE, (TI_PCISTATE_WRITE_MAXDMA| TI_PCISTATE_READ_MAXDMA)); /* Get cache line size. */ cacheline = CSR_READ_4(sc, TI_PCI_BIST) & 0xFF; /* * If the system has enabled the PCI memory write * and invalidate command in the command register, set * the write max parameter accordingly. This is necessary * to use MWI with the Tigon 2.
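* Only the cache line sizes enumerated in the switch below are usable with MWI; any other value causes PCIM_CMD_MWIEN to be cleared again.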
*/ if (CSR_READ_4(sc, TI_PCI_CMDSTAT) & PCIM_CMD_MWIEN) { switch (cacheline) { case 1: case 4: case 8: case 16: case 32: case 64: break; default: /* Disable PCI memory write and invalidate. */ if (bootverbose) device_printf(sc->ti_dev, "cache line size %d" " not supported; disabling PCI MWI\n", cacheline); CSR_WRITE_4(sc, TI_PCI_CMDSTAT, CSR_READ_4(sc, TI_PCI_CMDSTAT) & ~PCIM_CMD_MWIEN); break; } } TI_SETBIT(sc, TI_PCI_STATE, pci_writemax); /* This sets the min dma param all the way up (0xff). */ TI_SETBIT(sc, TI_PCI_STATE, TI_PCISTATE_MINDMA); if (sc->ti_hdrsplit) hdrsplit = TI_OPMODE_JUMBO_HDRSPLIT; else hdrsplit = 0; /* Configure DMA variables. */ #if BYTE_ORDER == BIG_ENDIAN CSR_WRITE_4(sc, TI_GCR_OPMODE, TI_OPMODE_BYTESWAP_BD | TI_OPMODE_BYTESWAP_DATA | TI_OPMODE_WORDSWAP_BD | TI_OPMODE_WARN_ENB | TI_OPMODE_FATAL_ENB | TI_OPMODE_DONT_FRAG_JUMBO | hdrsplit); #else /* BYTE_ORDER */ CSR_WRITE_4(sc, TI_GCR_OPMODE, TI_OPMODE_BYTESWAP_DATA| TI_OPMODE_WORDSWAP_BD|TI_OPMODE_DONT_FRAG_JUMBO| TI_OPMODE_WARN_ENB|TI_OPMODE_FATAL_ENB | hdrsplit); #endif /* BYTE_ORDER */ /* * Only allow 1 DMA channel to be active at a time. * I don't think this is a good idea, but without it * the firmware racks up lots of nicDmaReadRingFull * errors. This is not compatible with hardware checksums. */ if ((sc->ti_ifp->if_capenable & (IFCAP_TXCSUM | IFCAP_RXCSUM)) == 0) TI_SETBIT(sc, TI_GCR_OPMODE, TI_OPMODE_1_DMA_ACTIVE); /* Recommended settings from Tigon manual. */ CSR_WRITE_4(sc, TI_GCR_DMA_WRITECFG, TI_DMA_STATE_THRESH_8W); CSR_WRITE_4(sc, TI_GCR_DMA_READCFG, TI_DMA_STATE_THRESH_8W); if (ti_64bitslot_war(sc)) { device_printf(sc->ti_dev, "bios thinks we're in a 64 bit slot, " "but we aren't"); return (EINVAL); } return (0); } /* * Initialize the general information block and firmware, and * start the CPU(s) running. */ static int ti_gibinit(struct ti_softc *sc) { struct ifnet *ifp; struct ti_rcb *rcb; int i; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; /* Disable interrupts for now. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); /* Tell the chip where to find the general information block. */ CSR_WRITE_4(sc, TI_GCR_GENINFO_HI, (uint64_t)sc->ti_rdata.ti_info_paddr >> 32); CSR_WRITE_4(sc, TI_GCR_GENINFO_LO, sc->ti_rdata.ti_info_paddr & 0xFFFFFFFF); /* Load the firmware into SRAM. */ ti_loadfw(sc); /* Set up the contents of the general info and ring control blocks. */ /* Set up the event ring and producer pointer. */ bzero(sc->ti_rdata.ti_event_ring, TI_EVENT_RING_SZ); rcb = &sc->ti_rdata.ti_info->ti_ev_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_event_ring_paddr); rcb->ti_flags = 0; ti_hostaddr64(&sc->ti_rdata.ti_info->ti_ev_prodidx_ptr, sc->ti_rdata.ti_status_paddr + offsetof(struct ti_status, ti_ev_prodidx_r)); sc->ti_ev_prodidx.ti_idx = 0; CSR_WRITE_4(sc, TI_GCR_EVENTCONS_IDX, 0); sc->ti_ev_saved_considx = 0; /* Set up the command ring and producer mailbox. */ rcb = &sc->ti_rdata.ti_info->ti_cmd_rcb; ti_hostaddr64(&rcb->ti_hostaddr, TI_GCR_NIC_ADDR(TI_GCR_CMDRING)); rcb->ti_flags = 0; rcb->ti_max_len = 0; for (i = 0; i < TI_CMD_RING_CNT; i++) { CSR_WRITE_4(sc, TI_GCR_CMDRING + (i * 4), 0); } CSR_WRITE_4(sc, TI_GCR_CMDCONS_IDX, 0); CSR_WRITE_4(sc, TI_MB_CMDPROD_IDX, 0); sc->ti_cmd_saved_prodidx = 0; /* * Assign the address of the stats refresh buffer. * We re-use the current stats buffer for this to * conserve memory. 
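* The firmware presumably DMAs each statistics refresh to the address in ti_refresh_stats_ptr, so pointing it back at ti_stats makes the updates land directly on the live copy.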
*/ bzero(&sc->ti_rdata.ti_info->ti_stats, sizeof(struct ti_stats)); ti_hostaddr64(&sc->ti_rdata.ti_info->ti_refresh_stats_ptr, sc->ti_rdata.ti_info_paddr + offsetof(struct ti_gib, ti_stats)); /* Set up the standard receive ring. */ rcb = &sc->ti_rdata.ti_info->ti_std_rx_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_std_ring_paddr); rcb->ti_max_len = TI_FRAMELEN; rcb->ti_flags = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; /* Set up the jumbo receive ring. */ rcb = &sc->ti_rdata.ti_info->ti_jumbo_rx_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_jumbo_ring_paddr); #ifndef TI_SF_BUF_JUMBO rcb->ti_max_len = MJUM9BYTES - ETHER_ALIGN; rcb->ti_flags = 0; #else rcb->ti_max_len = PAGE_SIZE; rcb->ti_flags = TI_RCB_FLAG_USE_EXT_RX_BD; #endif if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; /* * Set up the mini ring. Only activated on the * Tigon 2 but the slot in the config block is * still there on the Tigon 1. */ rcb = &sc->ti_rdata.ti_info->ti_mini_rx_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_mini_ring_paddr); rcb->ti_max_len = MHLEN - ETHER_ALIGN; if (sc->ti_hwrev == TI_HWREV_TIGON) rcb->ti_flags = TI_RCB_FLAG_RING_DISABLED; else rcb->ti_flags = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; /* * Set up the receive return ring. */ rcb = &sc->ti_rdata.ti_info->ti_return_rcb; ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_rx_return_ring_paddr); rcb->ti_flags = 0; rcb->ti_max_len = TI_RETURN_RING_CNT; ti_hostaddr64(&sc->ti_rdata.ti_info->ti_return_prodidx_ptr, sc->ti_rdata.ti_status_paddr + offsetof(struct ti_status, ti_return_prodidx_r)); /* * Set up the tx ring. Note: for the Tigon 2, we have the option * of putting the transmit ring in the host's address space and * letting the chip DMA it instead of leaving the ring in the NIC's * memory and accessing it through the shared memory region. We * do this for the Tigon 2, but it doesn't work on the Tigon 1, * so we have to revert to the shared memory scheme if we detect * a Tigon 1 chip. 
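* Concretely: TI_RCB_FLAG_HOST_RING plus a host physical address selects the host-memory scheme below, while the Tigon 1 gets the NIC-local address TI_TX_RING_BASE and is fed through ti_mem_write() instead.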
*/ CSR_WRITE_4(sc, TI_WINBASE, TI_TX_RING_BASE); if (sc->ti_rdata.ti_tx_ring != NULL) bzero(sc->ti_rdata.ti_tx_ring, TI_TX_RING_SZ); rcb = &sc->ti_rdata.ti_info->ti_tx_rcb; if (sc->ti_hwrev == TI_HWREV_TIGON) rcb->ti_flags = 0; else rcb->ti_flags = TI_RCB_FLAG_HOST_RING; if (sc->ti_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) rcb->ti_flags |= TI_RCB_FLAG_VLAN_ASSIST; if (sc->ti_ifp->if_capenable & IFCAP_TXCSUM) rcb->ti_flags |= TI_RCB_FLAG_TCP_UDP_CKSUM | TI_RCB_FLAG_IP_CKSUM | TI_RCB_FLAG_NO_PHDR_CKSUM; rcb->ti_max_len = TI_TX_RING_CNT; if (sc->ti_hwrev == TI_HWREV_TIGON) ti_hostaddr64(&rcb->ti_hostaddr, TI_TX_RING_BASE); else ti_hostaddr64(&rcb->ti_hostaddr, sc->ti_rdata.ti_tx_ring_paddr); ti_hostaddr64(&sc->ti_rdata.ti_info->ti_tx_considx_ptr, sc->ti_rdata.ti_status_paddr + offsetof(struct ti_status, ti_tx_considx_r)); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); bus_dmamap_sync(sc->ti_cdata.ti_status_tag, sc->ti_cdata.ti_status_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); bus_dmamap_sync(sc->ti_cdata.ti_event_ring_tag, sc->ti_cdata.ti_event_ring_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); if (sc->ti_rdata.ti_tx_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_PREWRITE); /* Set up tunables */ #if 0 if (ifp->if_mtu > ETHERMTU + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) CSR_WRITE_4(sc, TI_GCR_RX_COAL_TICKS, (sc->ti_rx_coal_ticks / 10)); else #endif CSR_WRITE_4(sc, TI_GCR_RX_COAL_TICKS, sc->ti_rx_coal_ticks); CSR_WRITE_4(sc, TI_GCR_TX_COAL_TICKS, sc->ti_tx_coal_ticks); CSR_WRITE_4(sc, TI_GCR_STAT_TICKS, sc->ti_stat_ticks); CSR_WRITE_4(sc, TI_GCR_RX_MAX_COAL_BD, sc->ti_rx_max_coal_bds); CSR_WRITE_4(sc, TI_GCR_TX_MAX_COAL_BD, sc->ti_tx_max_coal_bds); CSR_WRITE_4(sc, TI_GCR_TX_BUFFER_RATIO, sc->ti_tx_buf_ratio); /* Turn interrupts on. */ CSR_WRITE_4(sc, TI_GCR_MASK_INTRS, 0); CSR_WRITE_4(sc, TI_MB_HOSTINTR, 0); /* Start CPU. */ TI_CLRBIT(sc, TI_CPU_STATE, (TI_CPUSTATE_HALT|TI_CPUSTATE_STEP)); return (0); } /* * Probe for a Tigon chip. Check the PCI vendor and device IDs * against our list and return its name if we find a match. */ static int ti_probe(device_t dev) { const struct ti_type *t; t = ti_devs; while (t->ti_name != NULL) { if ((pci_get_vendor(dev) == t->ti_vid) && (pci_get_device(dev) == t->ti_did)) { device_set_desc(dev, t->ti_name); return (BUS_PROBE_DEFAULT); } t++; } return (ENXIO); } static int ti_attach(device_t dev) { struct ifnet *ifp; struct ti_softc *sc; int error = 0, rid; u_char eaddr[6]; sc = device_get_softc(dev); sc->ti_dev = dev; mtx_init(&sc->ti_mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->ti_watchdog, &sc->ti_mtx, 0); ifmedia_init(&sc->ifmedia, IFM_IMASK, ti_ifmedia_upd, ti_ifmedia_sts); ifp = sc->ti_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not if_alloc()\n"); error = ENOSPC; goto fail; } sc->ti_ifp->if_hwassist = TI_CSUM_FEATURES; sc->ti_ifp->if_capabilities = IFCAP_TXCSUM | IFCAP_RXCSUM; sc->ti_ifp->if_capenable = sc->ti_ifp->if_capabilities; /* * Map control/status registers. 
*/ pci_enable_busmaster(dev); rid = PCIR_BAR(0); sc->ti_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->ti_res == NULL) { device_printf(dev, "couldn't map memory\n"); error = ENXIO; goto fail; } sc->ti_btag = rman_get_bustag(sc->ti_res); sc->ti_bhandle = rman_get_bushandle(sc->ti_res); /* Allocate interrupt */ rid = 0; sc->ti_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (sc->ti_irq == NULL) { device_printf(dev, "couldn't map interrupt\n"); error = ENXIO; goto fail; } if (ti_chipinit(sc)) { device_printf(dev, "chip initialization failed\n"); error = ENXIO; goto fail; } /* Zero out the NIC's on-board SRAM. */ ti_mem_zero(sc, 0x2000, 0x100000 - 0x2000); /* Init again -- zeroing memory may have clobbered some registers. */ if (ti_chipinit(sc)) { device_printf(dev, "chip initialization failed\n"); error = ENXIO; goto fail; } /* * Get station address from the EEPROM. Note: the manual states * that the MAC address is at offset 0x8c, however the data is * stored as two longwords (since that's how it's loaded into * the NIC). This means the MAC address is actually preceded * by two zero bytes. We need to skip over those. */ if (ti_read_eeprom(sc, eaddr, TI_EE_MAC_OFFSET + 2, ETHER_ADDR_LEN)) { device_printf(dev, "failed to read station address\n"); error = ENXIO; goto fail; } /* Allocate working area for memory dump. */ sc->ti_membuf = malloc(sizeof(uint8_t) * TI_WINLEN, M_DEVBUF, M_NOWAIT); sc->ti_membuf2 = malloc(sizeof(uint8_t) * TI_WINLEN, M_DEVBUF, M_NOWAIT); if (sc->ti_membuf == NULL || sc->ti_membuf2 == NULL) { device_printf(dev, "cannot allocate memory buffer\n"); error = ENOMEM; goto fail; } if ((error = ti_dma_alloc(sc)) != 0) goto fail; /* * We really need a better way to tell a 1000baseTX card * from a 1000baseSX one, since in theory there could be * OEMed 1000baseTX cards from lame vendors who aren't * clever enough to change the PCI ID. For the moment * though, the AceNIC is the only copper card available. */ if (pci_get_vendor(dev) == ALT_VENDORID && pci_get_device(dev) == ALT_DEVICEID_ACENIC_COPPER) sc->ti_copper = 1; /* Ok, it's not the only copper card available. */ if (pci_get_vendor(dev) == NG_VENDORID && pci_get_device(dev) == NG_DEVICEID_GA620T) sc->ti_copper = 1; /* Set default tunable values. */ ti_sysctl_node(sc); /* Set up ifnet structure */ ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ti_ioctl; ifp->if_start = ti_start; ifp->if_init = ti_init; ifp->if_get_counter = ti_get_counter; ifp->if_baudrate = IF_Gbps(1UL); ifp->if_snd.ifq_drv_maxlen = TI_TX_RING_CNT - 1; IFQ_SET_MAXLEN(&ifp->if_snd, ifp->if_snd.ifq_drv_maxlen); IFQ_SET_READY(&ifp->if_snd); /* Set up ifmedia support. */ if (sc->ti_copper) { /* * Copper cards allow manual 10/100 mode selection, * but not manual 1000baseTX mode selection. Why? * Because currently there's no way to specify the * master/slave setting through the firmware interface, * so Alteon decided to just bag it and handle it * via autonegotiation. */ ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_10_T, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_100_TX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_T, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL); } else { /* Fiber cards don't support 10/100 modes.
*/ ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_SX, 0, NULL); ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_1000_SX|IFM_FDX, 0, NULL); } ifmedia_add(&sc->ifmedia, IFM_ETHER|IFM_AUTO, 0, NULL); ifmedia_set(&sc->ifmedia, IFM_ETHER|IFM_AUTO); /* * We're assuming here that card initialization is a sequential * thing. If it isn't, multiple cards probing at the same time * could stomp on the list of softcs here. */ /* Register the device */ sc->dev = make_dev(&ti_cdevsw, device_get_unit(dev), UID_ROOT, GID_OPERATOR, 0600, "ti%d", device_get_unit(dev)); sc->dev->si_drv1 = sc; /* * Call MI attach routine. */ ether_ifattach(ifp, eaddr); /* VLAN capability setup. */ ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWTAGGING; ifp->if_capenable = ifp->if_capabilities; /* Tell the upper layer we support VLAN over-sized frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* Driver supports link state tracking. */ ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; /* Hook interrupt last to avoid having to lock softc */ error = bus_setup_intr(dev, sc->ti_irq, INTR_TYPE_NET|INTR_MPSAFE, NULL, ti_intr, sc, &sc->ti_intrhand); if (error) { device_printf(dev, "couldn't set up irq\n"); goto fail; } fail: if (error) ti_detach(dev); return (error); } /* * Shutdown hardware and free up resources. This can be called any * time after the mutex has been initialized. It is called in both * the error case in attach and the normal detach case so it needs * to be careful about only freeing resources that have actually been * allocated. */ static int ti_detach(device_t dev) { struct ti_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); if (sc->dev) destroy_dev(sc->dev); KASSERT(mtx_initialized(&sc->ti_mtx), ("ti mutex not initialized")); ifp = sc->ti_ifp; if (device_is_attached(dev)) { ether_ifdetach(ifp); TI_LOCK(sc); ti_stop(sc); TI_UNLOCK(sc); } /* These should only be active if attach succeeded */ callout_drain(&sc->ti_watchdog); bus_generic_detach(dev); ti_dma_free(sc); ifmedia_removeall(&sc->ifmedia); if (sc->ti_intrhand) bus_teardown_intr(dev, sc->ti_irq, sc->ti_intrhand); if (sc->ti_irq) bus_release_resource(dev, SYS_RES_IRQ, 0, sc->ti_irq); if (sc->ti_res) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), sc->ti_res); } if (ifp) if_free(ifp); if (sc->ti_membuf) free(sc->ti_membuf, M_DEVBUF); if (sc->ti_membuf2) free(sc->ti_membuf2, M_DEVBUF); mtx_destroy(&sc->ti_mtx); return (0); } #ifdef TI_JUMBO_HDRSPLIT /* * If hdr_len is 0, that means that header splitting wasn't done on * this packet for some reason. The two most likely reasons are that * the protocol isn't a supported protocol for splitting, or this * packet had a fragment offset that wasn't 0. * * The header length, if it is non-zero, will always be the length of * the headers on the packet, but that length could be longer than the * first mbuf. So we take the minimum of the two as the actual * length. 
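* For example, if the firmware reported hdr_len = 134 but the first mbuf only holds 128 bytes, the header portion is clamped to 128 and the remaining pkt_len is spread across the rest of the chain.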
*/ static __inline void ti_hdr_split(struct mbuf *top, int hdr_len, int pkt_len, int idx) { int i = 0; int lengths[4] = {0, 0, 0, 0}; struct mbuf *m, *mp; if (hdr_len != 0) top->m_len = min(hdr_len, top->m_len); pkt_len -= top->m_len; lengths[i++] = top->m_len; mp = top; for (m = top->m_next; m && pkt_len; m = m->m_next) { m->m_len = m->m_ext.ext_size = min(m->m_len, pkt_len); pkt_len -= m->m_len; lengths[i++] = m->m_len; mp = m; } #if 0 if (hdr_len != 0) printf("got split packet: "); else printf("got non-split packet: "); printf("%d,%d,%d,%d = %d\n", lengths[0], lengths[1], lengths[2], lengths[3], lengths[0] + lengths[1] + lengths[2] + lengths[3]); #endif if (pkt_len) panic("header splitting didn't"); if (m) { m_freem(m); mp->m_next = NULL; } if (mp->m_next != NULL) panic("ti_hdr_split: last mbuf in chain should be null"); } #endif /* TI_JUMBO_HDRSPLIT */ static void ti_discard_std(struct ti_softc *sc, int i) { struct ti_rx_desc *r; r = &sc->ti_rdata.ti_rx_std_ring[i]; r->ti_len = MCLBYTES - ETHER_ALIGN; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = 0; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; } static void ti_discard_mini(struct ti_softc *sc, int i) { struct ti_rx_desc *r; r = &sc->ti_rdata.ti_rx_mini_ring[i]; r->ti_len = MHLEN - ETHER_ALIGN; r->ti_type = TI_BDTYPE_RECV_BD; r->ti_flags = TI_BDFLAG_MINI_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; } #ifndef TI_SF_BUF_JUMBO static void ti_discard_jumbo(struct ti_softc *sc, int i) { struct ti_rx_desc *r; r = &sc->ti_rdata.ti_rx_jumbo_ring[i]; r->ti_len = MJUM9BYTES - ETHER_ALIGN; r->ti_type = TI_BDTYPE_RECV_JUMBO_BD; r->ti_flags = TI_BDFLAG_JUMBO_RING; r->ti_vlan_tag = 0; r->ti_tcp_udp_cksum = 0; if (sc->ti_ifp->if_capenable & IFCAP_RXCSUM) r->ti_flags |= TI_BDFLAG_TCP_UDP_CKSUM | TI_BDFLAG_IP_CKSUM; r->ti_idx = i; } #endif /* * Frame reception handling. This is called if there's a frame * on the receive return list. 
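* The NIC funnels completions from all three producer rings into this single return ring; ti_rx_saved_considx tracks how far we have consumed it.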
* * Note: we have to be able to handle three possibilities here: * 1) the frame is from the mini receive ring (can only happen * on Tigon 2 boards) * 2) the frame is from the jumbo receive ring * 3) the frame is from the standard receive ring */ static void ti_rxeof(struct ti_softc *sc) { struct ifnet *ifp; #ifdef TI_SF_BUF_JUMBO bus_dmamap_t map; #endif struct ti_cmd_desc cmd; int jumbocnt, minicnt, stdcnt, ti_len; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; bus_dmamap_sync(sc->ti_cdata.ti_rx_std_ring_tag, sc->ti_cdata.ti_rx_std_ring_map, BUS_DMASYNC_POSTWRITE); if (ifp->if_mtu > ETHERMTU + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_ring_tag, sc->ti_cdata.ti_rx_jumbo_ring_map, BUS_DMASYNC_POSTWRITE); if (sc->ti_rdata.ti_rx_mini_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_ring_tag, sc->ti_cdata.ti_rx_mini_ring_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_sync(sc->ti_cdata.ti_rx_return_ring_tag, sc->ti_cdata.ti_rx_return_ring_map, BUS_DMASYNC_POSTREAD); jumbocnt = minicnt = stdcnt = 0; while (sc->ti_rx_saved_considx != sc->ti_return_prodidx.ti_idx) { struct ti_rx_desc *cur_rx; uint32_t rxidx; struct mbuf *m = NULL; uint16_t vlan_tag = 0; int have_tag = 0; cur_rx = &sc->ti_rdata.ti_rx_return_ring[sc->ti_rx_saved_considx]; rxidx = cur_rx->ti_idx; ti_len = cur_rx->ti_len; TI_INC(sc->ti_rx_saved_considx, TI_RETURN_RING_CNT); if (cur_rx->ti_flags & TI_BDFLAG_VLAN_TAG) { have_tag = 1; vlan_tag = cur_rx->ti_vlan_tag; } if (cur_rx->ti_flags & TI_BDFLAG_JUMBO_RING) { jumbocnt++; TI_INC(sc->ti_jumbo, TI_JUMBO_RX_RING_CNT); m = sc->ti_cdata.ti_rx_jumbo_chain[rxidx]; #ifndef TI_SF_BUF_JUMBO if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_discard_jumbo(sc, rxidx); continue; } if (ti_newbuf_jumbo(sc, rxidx, NULL) != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_discard_jumbo(sc, rxidx); continue; } m->m_len = ti_len; #else /* !TI_SF_BUF_JUMBO */ sc->ti_cdata.ti_rx_jumbo_chain[rxidx] = NULL; map = sc->ti_cdata.ti_rx_jumbo_maps[rxidx]; bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->ti_cdata.ti_rx_jumbo_tag, map); if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_newbuf_jumbo(sc, sc->ti_jumbo, m); continue; } if (ti_newbuf_jumbo(sc, sc->ti_jumbo, NULL) == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_newbuf_jumbo(sc, sc->ti_jumbo, m); continue; } #ifdef TI_JUMBO_HDRSPLIT if (sc->ti_hdrsplit) ti_hdr_split(m, TI_HOSTADDR(cur_rx->ti_addr), ti_len, rxidx); else #endif /* TI_JUMBO_HDRSPLIT */ m_adj(m, ti_len - m->m_pkthdr.len); #endif /* TI_SF_BUF_JUMBO */ } else if (cur_rx->ti_flags & TI_BDFLAG_MINI_RING) { minicnt++; TI_INC(sc->ti_mini, TI_MINI_RX_RING_CNT); m = sc->ti_cdata.ti_rx_mini_chain[rxidx]; if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_discard_mini(sc, rxidx); continue; } if (ti_newbuf_mini(sc, rxidx) != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_discard_mini(sc, rxidx); continue; } m->m_len = ti_len; } else { stdcnt++; TI_INC(sc->ti_std, TI_STD_RX_RING_CNT); m = sc->ti_cdata.ti_rx_std_chain[rxidx]; if (cur_rx->ti_flags & TI_BDFLAG_ERROR) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); ti_discard_std(sc, rxidx); continue; } if (ti_newbuf_std(sc, rxidx) != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); ti_discard_std(sc, rxidx); continue; } m->m_len = ti_len; } m->m_pkthdr.len = ti_len; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; if (ifp->if_capenable &
IFCAP_RXCSUM) { if (cur_rx->ti_flags & TI_BDFLAG_IP_CKSUM) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if ((cur_rx->ti_ip_cksum ^ 0xffff) == 0) m->m_pkthdr.csum_flags |= CSUM_IP_VALID; } if (cur_rx->ti_flags & TI_BDFLAG_TCP_UDP_CKSUM) { m->m_pkthdr.csum_data = cur_rx->ti_tcp_udp_cksum; m->m_pkthdr.csum_flags |= CSUM_DATA_VALID; } } /* * If we received a packet with a vlan tag, * tag it before passing the packet upward. */ if (have_tag) { m->m_pkthdr.ether_vtag = vlan_tag; m->m_flags |= M_VLANTAG; } TI_UNLOCK(sc); (*ifp->if_input)(ifp, m); TI_LOCK(sc); } bus_dmamap_sync(sc->ti_cdata.ti_rx_return_ring_tag, sc->ti_cdata.ti_rx_return_ring_map, BUS_DMASYNC_PREREAD); /* Only necessary on the Tigon 1. */ if (sc->ti_hwrev == TI_HWREV_TIGON) CSR_WRITE_4(sc, TI_GCR_RXRETURNCONS_IDX, sc->ti_rx_saved_considx); if (stdcnt > 0) { bus_dmamap_sync(sc->ti_cdata.ti_rx_std_ring_tag, sc->ti_cdata.ti_rx_std_ring_map, BUS_DMASYNC_PREWRITE); TI_UPDATE_STDPROD(sc, sc->ti_std); } if (minicnt > 0) { bus_dmamap_sync(sc->ti_cdata.ti_rx_mini_ring_tag, sc->ti_cdata.ti_rx_mini_ring_map, BUS_DMASYNC_PREWRITE); TI_UPDATE_MINIPROD(sc, sc->ti_mini); } if (jumbocnt > 0) { bus_dmamap_sync(sc->ti_cdata.ti_rx_jumbo_ring_tag, sc->ti_cdata.ti_rx_jumbo_ring_map, BUS_DMASYNC_PREWRITE); TI_UPDATE_JUMBOPROD(sc, sc->ti_jumbo); } } static void ti_txeof(struct ti_softc *sc) { struct ti_txdesc *txd; struct ti_tx_desc txdesc; struct ti_tx_desc *cur_tx = NULL; struct ifnet *ifp; int idx; ifp = sc->ti_ifp; txd = STAILQ_FIRST(&sc->ti_cdata.ti_txbusyq); if (txd == NULL) return; if (sc->ti_rdata.ti_tx_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_POSTWRITE); /* * Go through our tx ring and free mbufs for those * frames that have been sent. */ for (idx = sc->ti_tx_saved_considx; idx != sc->ti_tx_considx.ti_idx; TI_INC(idx, TI_TX_RING_CNT)) { if (sc->ti_hwrev == TI_HWREV_TIGON) { ti_mem_read(sc, TI_TX_RING_BASE + idx * sizeof(txdesc), sizeof(txdesc), &txdesc); cur_tx = &txdesc; } else cur_tx = &sc->ti_rdata.ti_tx_ring[idx]; sc->ti_txcnt--; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if ((cur_tx->ti_flags & TI_BDFLAG_END) == 0) continue; bus_dmamap_sync(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); m_freem(txd->tx_m); txd->tx_m = NULL; STAILQ_REMOVE_HEAD(&sc->ti_cdata.ti_txbusyq, tx_q); STAILQ_INSERT_TAIL(&sc->ti_cdata.ti_txfreeq, txd, tx_q); txd = STAILQ_FIRST(&sc->ti_cdata.ti_txbusyq); } sc->ti_tx_saved_considx = idx; if (sc->ti_txcnt == 0) sc->ti_timer = 0; } static void ti_intr(void *xsc) { struct ti_softc *sc; struct ifnet *ifp; sc = xsc; TI_LOCK(sc); ifp = sc->ti_ifp; /* Make sure this is really our interrupt. */ if (!(CSR_READ_4(sc, TI_MISC_HOST_CTL) & TI_MHC_INTSTATE)) { TI_UNLOCK(sc); return; } /* Ack interrupt and stop others from occurring. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bus_dmamap_sync(sc->ti_cdata.ti_status_tag, sc->ti_cdata.ti_status_map, BUS_DMASYNC_POSTREAD); /* Check RX return ring producer/consumer */ ti_rxeof(sc); /* Check TX ring producer/consumer */ ti_txeof(sc); bus_dmamap_sync(sc->ti_cdata.ti_status_tag, sc->ti_cdata.ti_status_map, BUS_DMASYNC_PREREAD); } ti_handle_events(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* Re-enable interrupts.
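* Writing 0 to the TI_MB_HOSTINTR mailbox unmasks the interrupt that was masked with a 1 at the top of ti_intr().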
*/ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 0); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ti_start_locked(ifp); } TI_UNLOCK(sc); } static uint64_t ti_get_counter(struct ifnet *ifp, ift_counter cnt) { switch (cnt) { case IFCOUNTER_COLLISIONS: { struct ti_softc *sc; struct ti_stats *s; uint64_t rv; sc = if_getsoftc(ifp); s = &sc->ti_rdata.ti_info->ti_stats; TI_LOCK(sc); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_POSTREAD); rv = s->dot3StatsSingleCollisionFrames + s->dot3StatsMultipleCollisionFrames + s->dot3StatsExcessiveCollisions + s->dot3StatsLateCollisions; bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_PREREAD); TI_UNLOCK(sc); return (rv); } default: return (if_get_counter_default(ifp, cnt)); } } /* * Encapsulate an mbuf chain in the tx ring by coupling the mbuf data * pointers to descriptors. */ static int ti_encap(struct ti_softc *sc, struct mbuf **m_head) { struct ti_txdesc *txd; struct ti_tx_desc *f; struct ti_tx_desc txdesc; struct mbuf *m; bus_dma_segment_t txsegs[TI_MAXTXSEGS]; uint16_t csum_flags; int error, frag, i, nseg; if ((txd = STAILQ_FIRST(&sc->ti_cdata.ti_txfreeq)) == NULL) return (ENOBUFS); error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, *m_head, txsegs, &nseg, 0); if (error == EFBIG) { m = m_defrag(*m_head, M_NOWAIT); if (m == NULL) { m_freem(*m_head); *m_head = NULL; return (ENOMEM); } *m_head = m; error = bus_dmamap_load_mbuf_sg(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, *m_head, txsegs, &nseg, 0); if (error) { m_freem(*m_head); *m_head = NULL; return (error); } } else if (error != 0) return (error); if (nseg == 0) { m_freem(*m_head); *m_head = NULL; return (EIO); } if (sc->ti_txcnt + nseg >= TI_TX_RING_CNT) { bus_dmamap_unload(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap); return (ENOBUFS); } bus_dmamap_sync(sc->ti_cdata.ti_tx_tag, txd->tx_dmamap, BUS_DMASYNC_PREWRITE); m = *m_head; csum_flags = 0; if (m->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= TI_BDFLAG_IP_CKSUM; if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)) csum_flags |= TI_BDFLAG_TCP_UDP_CKSUM; frag = sc->ti_tx_saved_prodidx; for (i = 0; i < nseg; i++) { if (sc->ti_hwrev == TI_HWREV_TIGON) { bzero(&txdesc, sizeof(txdesc)); f = &txdesc; } else f = &sc->ti_rdata.ti_tx_ring[frag]; ti_hostaddr64(&f->ti_addr, txsegs[i].ds_addr); f->ti_len = txsegs[i].ds_len; f->ti_flags = csum_flags; if (m->m_flags & M_VLANTAG) { f->ti_flags |= TI_BDFLAG_VLAN_TAG; f->ti_vlan_tag = m->m_pkthdr.ether_vtag; } else { f->ti_vlan_tag = 0; } if (sc->ti_hwrev == TI_HWREV_TIGON) ti_mem_write(sc, TI_TX_RING_BASE + frag * sizeof(txdesc), sizeof(txdesc), &txdesc); TI_INC(frag, TI_TX_RING_CNT); } sc->ti_tx_saved_prodidx = frag; /* set TI_BDFLAG_END on the last descriptor */ frag = (frag + TI_TX_RING_CNT - 1) % TI_TX_RING_CNT; if (sc->ti_hwrev == TI_HWREV_TIGON) { txdesc.ti_flags |= TI_BDFLAG_END; ti_mem_write(sc, TI_TX_RING_BASE + frag * sizeof(txdesc), sizeof(txdesc), &txdesc); } else sc->ti_rdata.ti_tx_ring[frag].ti_flags |= TI_BDFLAG_END; STAILQ_REMOVE_HEAD(&sc->ti_cdata.ti_txfreeq, tx_q); STAILQ_INSERT_TAIL(&sc->ti_cdata.ti_txbusyq, txd, tx_q); txd->tx_m = m; sc->ti_txcnt += nseg; return (0); } static void ti_start(struct ifnet *ifp) { struct ti_softc *sc; sc = ifp->if_softc; TI_LOCK(sc); ti_start_locked(ifp); TI_UNLOCK(sc); } /* * Main transmit routine. To avoid having to do mbuf copies, we put pointers * to the mbuf data regions directly in the transmit descriptors. 
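* The loop below keeps at least 16 descriptors in reserve and sets IFF_DRV_OACTIVE when ti_encap() fails, so transmission resumes only after ti_txeof() has reclaimed completed descriptors.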
*/ static void ti_start_locked(struct ifnet *ifp) { struct ti_softc *sc; struct mbuf *m_head = NULL; int enq = 0; sc = ifp->if_softc; for (; !IFQ_DRV_IS_EMPTY(&ifp->if_snd) && sc->ti_txcnt < (TI_TX_RING_CNT - 16);) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* * Pack the data into the transmit ring. If we * don't have room, set the OACTIVE flag and wait * for the NIC to drain the ring. */ if (ti_encap(sc, &m_head)) { if (m_head == NULL) break; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } enq++; /* * If there's a BPF listener, bounce a copy of this frame * to him. */ ETHER_BPF_MTAP(ifp, m_head); } if (enq > 0) { if (sc->ti_rdata.ti_tx_ring != NULL) bus_dmamap_sync(sc->ti_cdata.ti_tx_ring_tag, sc->ti_cdata.ti_tx_ring_map, BUS_DMASYNC_PREWRITE); /* Transmit */ CSR_WRITE_4(sc, TI_MB_SENDPROD_IDX, sc->ti_tx_saved_prodidx); /* * Set a timeout in case the chip goes out to lunch. */ sc->ti_timer = 5; } } static void ti_init(void *xsc) { struct ti_softc *sc; sc = xsc; TI_LOCK(sc); ti_init_locked(sc); TI_UNLOCK(sc); } static void ti_init_locked(void *xsc) { struct ti_softc *sc = xsc; if (sc->ti_ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Cancel pending I/O and flush buffers. */ ti_stop(sc); /* Init the gen info block, ring control blocks and firmware. */ if (ti_gibinit(sc)) { device_printf(sc->ti_dev, "initialization failure\n"); return; } } static void ti_init2(struct ti_softc *sc) { struct ti_cmd_desc cmd; struct ifnet *ifp; uint8_t *ea; struct ifmedia *ifm; int tmp; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; /* Specify MTU and interface index. */ CSR_WRITE_4(sc, TI_GCR_IFINDEX, device_get_unit(sc->ti_dev)); CSR_WRITE_4(sc, TI_GCR_IFMTU, ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN); TI_DO_CMD(TI_CMD_UPDATE_GENCOM, 0, 0); /* Load our MAC address. */ ea = IF_LLADDR(sc->ti_ifp); CSR_WRITE_4(sc, TI_GCR_PAR0, (ea[0] << 8) | ea[1]); CSR_WRITE_4(sc, TI_GCR_PAR1, (ea[2] << 24) | (ea[3] << 16) | (ea[4] << 8) | ea[5]); TI_DO_CMD(TI_CMD_SET_MAC_ADDR, 0, 0); /* Enable or disable promiscuous mode as needed. */ if (ifp->if_flags & IFF_PROMISC) { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_ENB, 0); } else { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_DIS, 0); } /* Program multicast filter. */ ti_setmulti(sc); /* * If this is a Tigon 1, we should tell the * firmware to use software packet filtering. */ if (sc->ti_hwrev == TI_HWREV_TIGON) { TI_DO_CMD(TI_CMD_FDR_FILTERING, TI_CMD_CODE_FILT_ENB, 0); } /* Init RX ring. */ if (ti_init_rx_ring_std(sc) != 0) { /* XXX */ device_printf(sc->ti_dev, "no memory for std Rx buffers.\n"); return; } /* Init jumbo RX ring. */ if (ifp->if_mtu > ETHERMTU + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) { if (ti_init_rx_ring_jumbo(sc) != 0) { /* XXX */ device_printf(sc->ti_dev, "no memory for jumbo Rx buffers.\n"); return; } } /* * If this is a Tigon 2, we can also configure the * mini ring. */ if (sc->ti_hwrev == TI_HWREV_TIGON_II) { if (ti_init_rx_ring_mini(sc) != 0) { /* XXX */ device_printf(sc->ti_dev, "no memory for mini Rx buffers.\n"); return; } } CSR_WRITE_4(sc, TI_GCR_RXRETURNCONS_IDX, 0); sc->ti_rx_saved_considx = 0; /* Init TX ring. */ ti_init_tx_ring(sc); /* Tell firmware we're alive. */ TI_DO_CMD(TI_CMD_HOST_STATE, TI_CMD_CODE_STACK_UP, 0); /* Enable host interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 0); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&sc->ti_watchdog, hz, ti_watchdog, sc); /* * Make sure to set media properly. 
We have to do this * here since we have to issue commands in order to set * the link negotiation and we can't issue commands until * the firmware is running. */ ifm = &sc->ifmedia; tmp = ifm->ifm_media; ifm->ifm_media = ifm->ifm_cur->ifm_media; ti_ifmedia_upd_locked(sc); ifm->ifm_media = tmp; } /* * Set media options. */ static int ti_ifmedia_upd(struct ifnet *ifp) { struct ti_softc *sc; int error; sc = ifp->if_softc; TI_LOCK(sc); error = ti_ifmedia_upd_locked(sc); TI_UNLOCK(sc); return (error); } static int ti_ifmedia_upd_locked(struct ti_softc *sc) { struct ifmedia *ifm; struct ti_cmd_desc cmd; uint32_t flowctl; ifm = &sc->ifmedia; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); flowctl = 0; switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: /* * Transmit flow control doesn't work on the Tigon 1. */ flowctl = TI_GLNK_RX_FLOWCTL_Y; /* * Transmit flow control can also cause problems on the * Tigon 2, apparently with both the copper and fiber * boards. The symptom is that the interface will just * hang. This was reproduced with Alteon 180 switches. */ #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_GLNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_GLINK, TI_GLNK_PREF|TI_GLNK_1000MB| TI_GLNK_FULL_DUPLEX| flowctl | TI_GLNK_AUTONEGENB|TI_GLNK_ENB); flowctl = TI_LNK_RX_FLOWCTL_Y; #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_LNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_LINK, TI_LNK_100MB|TI_LNK_10MB| TI_LNK_FULL_DUPLEX|TI_LNK_HALF_DUPLEX| flowctl | TI_LNK_AUTONEGENB|TI_LNK_ENB); TI_DO_CMD(TI_CMD_LINK_NEGOTIATION, TI_CMD_CODE_NEGOTIATE_BOTH, 0); break; case IFM_1000_SX: case IFM_1000_T: flowctl = TI_GLNK_RX_FLOWCTL_Y; #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_GLNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_GLINK, TI_GLNK_PREF|TI_GLNK_1000MB| flowctl |TI_GLNK_ENB); CSR_WRITE_4(sc, TI_GCR_LINK, 0); if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) { TI_SETBIT(sc, TI_GCR_GLINK, TI_GLNK_FULL_DUPLEX); } TI_DO_CMD(TI_CMD_LINK_NEGOTIATION, TI_CMD_CODE_NEGOTIATE_GIGABIT, 0); break; case IFM_100_FX: case IFM_10_FL: case IFM_100_TX: case IFM_10_T: flowctl = TI_LNK_RX_FLOWCTL_Y; #if 0 if (sc->ti_hwrev != TI_HWREV_TIGON) flowctl |= TI_LNK_TX_FLOWCTL_Y; #endif CSR_WRITE_4(sc, TI_GCR_GLINK, 0); CSR_WRITE_4(sc, TI_GCR_LINK, TI_LNK_ENB|TI_LNK_PREF|flowctl); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_100_FX || IFM_SUBTYPE(ifm->ifm_media) == IFM_100_TX) { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_100MB); } else { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_10MB); } if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_FULL_DUPLEX); } else { TI_SETBIT(sc, TI_GCR_LINK, TI_LNK_HALF_DUPLEX); } TI_DO_CMD(TI_CMD_LINK_NEGOTIATION, TI_CMD_CODE_NEGOTIATE_10_100, 0); break; } return (0); } /* * Report current media status.
*/ static void ti_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct ti_softc *sc; uint32_t media = 0; sc = ifp->if_softc; TI_LOCK(sc); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (sc->ti_linkstat == TI_EV_CODE_LINK_DOWN) { TI_UNLOCK(sc); return; } ifmr->ifm_status |= IFM_ACTIVE; if (sc->ti_linkstat == TI_EV_CODE_GIG_LINK_UP) { media = CSR_READ_4(sc, TI_GCR_GLINK_STAT); if (sc->ti_copper) ifmr->ifm_active |= IFM_1000_T; else ifmr->ifm_active |= IFM_1000_SX; if (media & TI_GLNK_FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; } else if (sc->ti_linkstat == TI_EV_CODE_LINK_UP) { media = CSR_READ_4(sc, TI_GCR_LINK_STAT); if (sc->ti_copper) { if (media & TI_LNK_100MB) ifmr->ifm_active |= IFM_100_TX; if (media & TI_LNK_10MB) ifmr->ifm_active |= IFM_10_T; } else { if (media & TI_LNK_100MB) ifmr->ifm_active |= IFM_100_FX; if (media & TI_LNK_10MB) ifmr->ifm_active |= IFM_10_FL; } if (media & TI_LNK_FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; if (media & TI_LNK_HALF_DUPLEX) ifmr->ifm_active |= IFM_HDX; } TI_UNLOCK(sc); } static int ti_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ti_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; struct ti_cmd_desc cmd; int mask, error = 0; switch (command) { case SIOCSIFMTU: TI_LOCK(sc); if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > TI_JUMBO_MTU) error = EINVAL; else { ifp->if_mtu = ifr->ifr_mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ti_init_locked(sc); } } TI_UNLOCK(sc); break; case SIOCSIFFLAGS: TI_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. 
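* The cached sc->ti_if_flags is compared against the new flags below to detect exactly that PROMISC-only transition.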
*/ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->ti_if_flags & IFF_PROMISC)) { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_ENB, 0); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->ti_if_flags & IFF_PROMISC) { TI_DO_CMD(TI_CMD_SET_PROMISC_MODE, TI_CMD_CODE_PROMISC_DIS, 0); } else ti_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ti_stop(sc); } } sc->ti_if_flags = ifp->if_flags; TI_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: TI_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) ti_setmulti(sc); TI_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->ifmedia, command); break; case SIOCSIFCAP: TI_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if ((mask & IFCAP_TXCSUM) != 0 && (ifp->if_capabilities & IFCAP_TXCSUM) != 0) { ifp->if_capenable ^= IFCAP_TXCSUM; if ((ifp->if_capenable & IFCAP_TXCSUM) != 0) ifp->if_hwassist |= TI_CSUM_FEATURES; else ifp->if_hwassist &= ~TI_CSUM_FEATURES; } if ((mask & IFCAP_RXCSUM) != 0 && (ifp->if_capabilities & IFCAP_RXCSUM) != 0) ifp->if_capenable ^= IFCAP_RXCSUM; if ((mask & IFCAP_VLAN_HWTAGGING) != 0 && (ifp->if_capabilities & IFCAP_VLAN_HWTAGGING) != 0) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if ((mask & IFCAP_VLAN_HWCSUM) != 0 && (ifp->if_capabilities & IFCAP_VLAN_HWCSUM) != 0) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; if ((mask & (IFCAP_TXCSUM | IFCAP_RXCSUM | IFCAP_VLAN_HWTAGGING)) != 0) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ti_init_locked(sc); } } TI_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int ti_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct ti_softc *sc; sc = dev->si_drv1; if (sc == NULL) return (ENODEV); TI_LOCK(sc); sc->ti_flags |= TI_FLAG_DEBUGING; TI_UNLOCK(sc); return (0); } static int ti_close(struct cdev *dev, int flag, int fmt, struct thread *td) { struct ti_softc *sc; sc = dev->si_drv1; if (sc == NULL) return (ENODEV); TI_LOCK(sc); sc->ti_flags &= ~TI_FLAG_DEBUGING; TI_UNLOCK(sc); return (0); } /* * This ioctl routine goes along with the Tigon character device. 
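* It implements the TIIOC* statistics/tuning calls plus the ALT_* register and memory debugging calls handled in the switch below.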
*/ static int ti_ioctl2(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ti_softc *sc; int error; sc = dev->si_drv1; if (sc == NULL) return (ENODEV); error = 0; switch (cmd) { case TIIOCGETSTATS: { struct ti_stats *outstats; outstats = (struct ti_stats *)addr; TI_LOCK(sc); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_POSTREAD); bcopy(&sc->ti_rdata.ti_info->ti_stats, outstats, sizeof(struct ti_stats)); bus_dmamap_sync(sc->ti_cdata.ti_gib_tag, sc->ti_cdata.ti_gib_map, BUS_DMASYNC_PREREAD); TI_UNLOCK(sc); break; } case TIIOCGETPARAMS: { struct ti_params *params; params = (struct ti_params *)addr; TI_LOCK(sc); params->ti_stat_ticks = sc->ti_stat_ticks; params->ti_rx_coal_ticks = sc->ti_rx_coal_ticks; params->ti_tx_coal_ticks = sc->ti_tx_coal_ticks; params->ti_rx_max_coal_bds = sc->ti_rx_max_coal_bds; params->ti_tx_max_coal_bds = sc->ti_tx_max_coal_bds; params->ti_tx_buf_ratio = sc->ti_tx_buf_ratio; params->param_mask = TI_PARAM_ALL; TI_UNLOCK(sc); break; } case TIIOCSETPARAMS: { struct ti_params *params; params = (struct ti_params *)addr; TI_LOCK(sc); if (params->param_mask & TI_PARAM_STAT_TICKS) { sc->ti_stat_ticks = params->ti_stat_ticks; CSR_WRITE_4(sc, TI_GCR_STAT_TICKS, sc->ti_stat_ticks); } if (params->param_mask & TI_PARAM_RX_COAL_TICKS) { sc->ti_rx_coal_ticks = params->ti_rx_coal_ticks; CSR_WRITE_4(sc, TI_GCR_RX_COAL_TICKS, sc->ti_rx_coal_ticks); } if (params->param_mask & TI_PARAM_TX_COAL_TICKS) { sc->ti_tx_coal_ticks = params->ti_tx_coal_ticks; CSR_WRITE_4(sc, TI_GCR_TX_COAL_TICKS, sc->ti_tx_coal_ticks); } if (params->param_mask & TI_PARAM_RX_COAL_BDS) { sc->ti_rx_max_coal_bds = params->ti_rx_max_coal_bds; CSR_WRITE_4(sc, TI_GCR_RX_MAX_COAL_BD, sc->ti_rx_max_coal_bds); } if (params->param_mask & TI_PARAM_TX_COAL_BDS) { sc->ti_tx_max_coal_bds = params->ti_tx_max_coal_bds; CSR_WRITE_4(sc, TI_GCR_TX_MAX_COAL_BD, sc->ti_tx_max_coal_bds); } if (params->param_mask & TI_PARAM_TX_BUF_RATIO) { sc->ti_tx_buf_ratio = params->ti_tx_buf_ratio; CSR_WRITE_4(sc, TI_GCR_TX_BUFFER_RATIO, sc->ti_tx_buf_ratio); } TI_UNLOCK(sc); break; } case TIIOCSETTRACE: { ti_trace_type trace_type; trace_type = *(ti_trace_type *)addr; /* * Set tracing to whatever the user asked for. Setting * this register to 0 should have the effect of disabling * tracing. 
*/ TI_LOCK(sc); CSR_WRITE_4(sc, TI_GCR_NIC_TRACING, trace_type); TI_UNLOCK(sc); break; } case TIIOCGETTRACE: { struct ti_trace_buf *trace_buf; uint32_t trace_start, cur_trace_ptr, trace_len; trace_buf = (struct ti_trace_buf *)addr; TI_LOCK(sc); trace_start = CSR_READ_4(sc, TI_GCR_NICTRACE_START); cur_trace_ptr = CSR_READ_4(sc, TI_GCR_NICTRACE_PTR); trace_len = CSR_READ_4(sc, TI_GCR_NICTRACE_LEN); #if 0 if_printf(sc->ti_ifp, "trace_start = %#x, cur_trace_ptr = %#x, " "trace_len = %d\n", trace_start, cur_trace_ptr, trace_len); if_printf(sc->ti_ifp, "trace_buf->buf_len = %d\n", trace_buf->buf_len); #endif error = ti_copy_mem(sc, trace_start, min(trace_len, trace_buf->buf_len), (caddr_t)trace_buf->buf, 1, 1); if (error == 0) { trace_buf->fill_len = min(trace_len, trace_buf->buf_len); if (cur_trace_ptr < trace_start) trace_buf->cur_trace_ptr = trace_start - cur_trace_ptr; else trace_buf->cur_trace_ptr = cur_trace_ptr - trace_start; } else trace_buf->fill_len = 0; TI_UNLOCK(sc); break; } /* * For debugging, five ioctls are needed: * ALT_ATTACH * ALT_READ_TG_REG * ALT_WRITE_TG_REG * ALT_READ_TG_MEM * ALT_WRITE_TG_MEM */ case ALT_ATTACH: /* * From what I can tell, Alteon's Solaris Tigon driver * only has one character device, so you have to attach * to the Tigon board you're interested in. This seems * like a not-so-good way to do things, since unless you * subsequently specify the unit number of the device * you're interested in every ioctl, you'll only be * able to debug one board at a time. */ break; case ALT_READ_TG_MEM: case ALT_WRITE_TG_MEM: { struct tg_mem *mem_param; uint32_t sram_end, scratch_end; mem_param = (struct tg_mem *)addr; if (sc->ti_hwrev == TI_HWREV_TIGON) { sram_end = TI_END_SRAM_I; scratch_end = TI_END_SCRATCH_I; } else { sram_end = TI_END_SRAM_II; scratch_end = TI_END_SCRATCH_II; } /* * For now, we'll only handle accessing regular SRAM, * nothing else. */ TI_LOCK(sc); if (mem_param->tgAddr >= TI_BEG_SRAM && mem_param->tgAddr + mem_param->len <= sram_end) { /* * In this instance, we always copy to/from user * space, so the user space argument is set to 1. */ error = ti_copy_mem(sc, mem_param->tgAddr, mem_param->len, mem_param->userAddr, 1, cmd == ALT_READ_TG_MEM ? 1 : 0); } else if (mem_param->tgAddr >= TI_BEG_SCRATCH && mem_param->tgAddr <= scratch_end) { error = ti_copy_scratch(sc, mem_param->tgAddr, mem_param->len, mem_param->userAddr, 1, cmd == ALT_READ_TG_MEM ? 1 : 0, TI_PROCESSOR_A); } else if (mem_param->tgAddr >= TI_BEG_SCRATCH_B_DEBUG && mem_param->tgAddr <= TI_BEG_SCRATCH_B_DEBUG) { if (sc->ti_hwrev == TI_HWREV_TIGON) { if_printf(sc->ti_ifp, "invalid memory range for Tigon I\n"); error = EINVAL; break; } error = ti_copy_scratch(sc, mem_param->tgAddr - TI_SCRATCH_DEBUG_OFF, mem_param->len, mem_param->userAddr, 1, cmd == ALT_READ_TG_MEM ? 1 : 0, TI_PROCESSOR_B); } else { if_printf(sc->ti_ifp, "memory address %#x len %d is " "out of supported range\n", mem_param->tgAddr, mem_param->len); error = EINVAL; } TI_UNLOCK(sc); break; } case ALT_READ_TG_REG: case ALT_WRITE_TG_REG: { struct tg_reg *regs; uint32_t tmpval; regs = (struct tg_reg *)addr; /* * Make sure the address in question isn't out of range. 
*/ if (regs->addr > TI_REG_MAX) { error = EINVAL; break; } TI_LOCK(sc); if (cmd == ALT_READ_TG_REG) { bus_space_read_region_4(sc->ti_btag, sc->ti_bhandle, regs->addr, &tmpval, 1); regs->data = ntohl(tmpval); #if 0 if ((regs->addr == TI_CPU_STATE) || (regs->addr == TI_CPU_CTL_B)) { if_printf(sc->ti_ifp, "register %#x = %#x\n", regs->addr, tmpval); } #endif } else { tmpval = htonl(regs->data); bus_space_write_region_4(sc->ti_btag, sc->ti_bhandle, regs->addr, &tmpval, 1); } TI_UNLOCK(sc); break; } default: error = ENOTTY; break; } return (error); } static void ti_watchdog(void *arg) { struct ti_softc *sc; struct ifnet *ifp; sc = arg; TI_LOCK_ASSERT(sc); callout_reset(&sc->ti_watchdog, hz, ti_watchdog, sc); if (sc->ti_timer == 0 || --sc->ti_timer > 0) return; /* * When we're debugging, the chip is often stopped for long periods * of time, and that would normally cause the watchdog timer to fire. * Since that impedes debugging, we don't want to do that. */ if (sc->ti_flags & TI_FLAG_DEBUGING) return; ifp = sc->ti_ifp; if_printf(ifp, "watchdog timeout -- resetting\n"); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ti_init_locked(sc); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * Stop the adapter and free any mbufs allocated to the * RX and TX lists. */ static void ti_stop(struct ti_softc *sc) { struct ifnet *ifp; struct ti_cmd_desc cmd; TI_LOCK_ASSERT(sc); ifp = sc->ti_ifp; /* Disable host interrupts. */ CSR_WRITE_4(sc, TI_MB_HOSTINTR, 1); /* * Tell firmware we're shutting down. */ TI_DO_CMD(TI_CMD_HOST_STATE, TI_CMD_CODE_STACK_DOWN, 0); /* Halt and reinitialize. */ if (ti_chipinit(sc) == 0) { ti_mem_zero(sc, 0x2000, 0x100000 - 0x2000); /* XXX ignore init errors. */ ti_chipinit(sc); } /* Free the RX lists. */ ti_free_rx_ring_std(sc); /* Free jumbo RX list. */ ti_free_rx_ring_jumbo(sc); /* Free mini RX list. */ ti_free_rx_ring_mini(sc); /* Free TX buffers. */ ti_free_tx_ring(sc); sc->ti_ev_prodidx.ti_idx = 0; sc->ti_return_prodidx.ti_idx = 0; sc->ti_tx_considx.ti_idx = 0; sc->ti_tx_saved_considx = TI_TXCONS_UNSET; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); callout_stop(&sc->ti_watchdog); } /* * Stop all chip I/O so that the kernel's probe routines don't * get confused by errant DMAs when rebooting. 
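* ti_chipinit() halts the on-board CPU, which should be enough to quiesce any DMA the firmware still has in flight.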
*/ static int ti_shutdown(device_t dev) { struct ti_softc *sc; sc = device_get_softc(dev); TI_LOCK(sc); ti_chipinit(sc); TI_UNLOCK(sc); return (0); } static void ti_sysctl_node(struct ti_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *child; char tname[32]; ctx = device_get_sysctl_ctx(sc->ti_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->ti_dev)); /* Use DAC */ sc->ti_dac = 1; snprintf(tname, sizeof(tname), "dev.ti.%d.dac", device_get_unit(sc->ti_dev)); TUNABLE_INT_FETCH(tname, &sc->ti_dac); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rx_coal_ticks", CTLFLAG_RW, &sc->ti_rx_coal_ticks, 0, "Receive coalesced ticks"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rx_max_coal_bds", CTLFLAG_RW, &sc->ti_rx_max_coal_bds, 0, "Receive max coalesced BDs"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tx_coal_ticks", CTLFLAG_RW, &sc->ti_tx_coal_ticks, 0, "Send coalesced ticks"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tx_max_coal_bds", CTLFLAG_RW, &sc->ti_tx_max_coal_bds, 0, "Send max coalesced BDs"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tx_buf_ratio", CTLFLAG_RW, &sc->ti_tx_buf_ratio, 0, "Ratio of NIC memory devoted to TX buffer"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "stat_ticks", CTLFLAG_RW, &sc->ti_stat_ticks, 0, "Number of clock ticks for statistics update interval"); /* Pull in device tunables. */ sc->ti_rx_coal_ticks = 170; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "rx_coal_ticks", &sc->ti_rx_coal_ticks); sc->ti_rx_max_coal_bds = 64; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "rx_max_coal_bds", &sc->ti_rx_max_coal_bds); sc->ti_tx_coal_ticks = TI_TICKS_PER_SEC / 500; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "tx_coal_ticks", &sc->ti_tx_coal_ticks); sc->ti_tx_max_coal_bds = 32; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "tx_max_coal_bds", &sc->ti_tx_max_coal_bds); sc->ti_tx_buf_ratio = 21; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "tx_buf_ratio", &sc->ti_tx_buf_ratio); sc->ti_stat_ticks = 2 * TI_TICKS_PER_SEC; resource_int_value(device_get_name(sc->ti_dev), device_get_unit(sc->ti_dev), "stat_ticks", &sc->ti_stat_ticks); } Index: head/sys/kern/uipc_mbuf.c =================================================================== --- head/sys/kern/uipc_mbuf.c (revision 293438) +++ head/sys/kern/uipc_mbuf.c (revision 293439) @@ -1,2061 +1,2065 @@ /*- * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include "opt_mbuf_profiling.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/sysctl.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/uio.h> int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif /* * sysctl(8) exported objects */ SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, &max_linkhdr, 0, "Size of largest link layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, &max_protohdr, 0, "Size of largest protocol layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, &max_hdr, 0, "Size of largest link plus protocol header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD, &max_datalen, 0, "Minimum space left in mbuf after max_hdr"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif /* * Ensure the correct size of various mbuf parameters. It could be off due * to compiler-induced padding and alignment artifacts. */ CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); /* * mbuf data storage should be 64-bit aligned regardless of architectural * pointer size; check this is the case with and without a packet header. */ CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); /* * While the specific values here don't matter too much (i.e., +/- a few * words), we do want to ensure that changes to these values are carefully * reasoned about and properly documented. This is especially the case as * network-protocol and device-driver modules encode these layouts, and must * be recompiled if the structures change. Check these values at compile time * against the ones documented in comments in mbuf.h. * * NB: Possibly they should be documented there via #define's and not just * comments. */ #if defined(__LP64__) CTASSERT(offsetof(struct mbuf, m_dat) == 32); CTASSERT(sizeof(struct pkthdr) == 56); CTASSERT(sizeof(struct m_ext) == 48); #else CTASSERT(offsetof(struct mbuf, m_dat) == 24); CTASSERT(sizeof(struct pkthdr) == 48); CTASSERT(sizeof(struct m_ext) == 28); #endif /* * Assert that the queue(3) macros produce code of the same size as an old * plain pointer does.
*/ #ifdef INVARIANTS static struct mbuf m_assertbuf; CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); #endif /* * m_get2() allocates the minimum-sized mbuf that would fit the "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one but still return the top of the newly allocated * chain. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { if (len > MCLBYTES) mb = m_getjcl(how, type, (flags & M_PKTHDR), MJUMPAGESIZE); else if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* Fail the whole operation if one mbuf can't be allocated. */ if (mb == NULL) { if (nm != NULL) m_freem(nm); return (NULL); } /* Bookkeeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. */ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { while (mb != NULL) mb = m_free(mb); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and set up a reference count for said buffer. If the setting * up of the reference count fails, the M_EXT bit will not be set. If * successful, the M_EXT bit is set in the mbuf's flags.
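 *
 * A minimal usage sketch (the buffer "buf", its size BUFSIZE, the free
 * routine "my_free" and its argument "arg" are hypothetical, not part of
 * this file):
 *
 *	m = m_get(M_WAITOK, MT_DATA);
 *	if (m_extadd(m, buf, BUFSIZE, my_free, arg, NULL, 0,
 *	    EXT_NET_DRV, M_WAITOK) != 0) {
 *		m_free(m);	(refcount allocation failed, M_EXT unset)
 *		m = NULL;
 *	}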
* * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * arg1, arg2 Pointers (of any type) to be passed to the provided freef * routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * wait M_WAITOK or M_NOWAIT; whether the reference count allocation * may sleep. * * Returns: * 0 on success, or ENOMEM if the reference counter could not be * allocated. */ int m_extadd(struct mbuf *mb, caddr_t buf, u_int size, void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2, int flags, int type, int wait) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); if (type != EXT_EXTREF) mb->m_ext.ext_cnt = uma_zalloc(zone_ext_refcnt, wait); if (mb->m_ext.ext_cnt == NULL) return (ENOMEM); *(mb->m_ext.ext_cnt) = 1; mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; mb->m_ext.ext_flags = 0; return (0); } /* * Non-directly-exported function to clean up after mbufs with M_EXT * storage attached to them if the reference count hits 1. */ void mb_free_ext(struct mbuf *m) { int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* * Check if the header is embedded in the cluster. */ freembuf = (m->m_flags & M_NOFREE) ? 0 : 1; switch (m->m_ext.ext_type) { case EXT_SFBUF: sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2); break; + case EXT_SFBUF_NOCACHE: + sf_ext_free_nocache(m->m_ext.ext_arg1, m->m_ext.ext_arg2); + break; default: KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); /* * Free attached storage if this mbuf is the only * reference to it. */ if (*(m->m_ext.ext_cnt) != 1) { if (atomic_fetchadd_int(m->m_ext.ext_cnt, -1) != 1) break; } switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*(m->m_ext.ext_cnt) == 0) *(m->m_ext.ext_cnt) = 1; uma_zfree(zone_pack, m); return; /* Job done. */ case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); break; case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: *(m->m_ext.ext_cnt) = 0; uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *, m->m_ext.ext_cnt)); /* FALLTHROUGH */ case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); (*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1, m->m_ext.ext_arg2); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf) uma_zfree(zone_mbuf, m); } /* * Attach the cluster from *m to *n, set up m_ext in *n * and bump the refcount of the cluster.
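 *
 * For reference, m_copym() below uses it in this pattern whenever the
 * source mbuf carries external storage:
 *
 *	if (m->m_flags & M_EXT) {
 *		n->m_data = m->m_data + off;
 *		mb_dupcl(n, m);
 *	}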
*/ void mb_dupcl(struct mbuf *n, const struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n)); switch (m->m_ext.ext_type) { case EXT_SFBUF: + case EXT_SFBUF_NOCACHE: sf_ext_ref(m->m_ext.ext_arg1, m->m_ext.ext_arg2); break; default: KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); if (*(m->m_ext.ext_cnt) == 1) *(m->m_ext.ext_cnt) += 1; else atomic_add_int(m->m_ext.ext_cnt, 1); } n->m_ext = m->m_ext; n->m_flags |= M_EXT; n->m_flags |= m->m_flags & M_RDONLY; } void m_demote_pkthdr(struct mbuf *m) { M_ASSERTPKTHDR(m); m_tag_delete_chain(m, NULL); m->m_flags &= ~M_PKTHDR; bzero(&m->m_pkthdr, sizeof(struct pkthdr)); } /* * Clean up mbuf (chain) from any tags and packet headers. * If "all" is set then the first mbuf in the chain will be * cleaned too. */ void m_demote(struct mbuf *m0, int all, int flags) { struct mbuf *m; for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); } } /* * Sanity checks on mbuf (chain) for use in KASSERT() and general * debugging. * Returns 0 (or panics) when bad and 1 when all tests have passed. * The "sanitize" argument selects the failure mode: 0 to run * M_SANITY_ACTION, 1 to garble things so they blow up later. */ int m_sanity(struct mbuf *m0, int sanitize) { struct mbuf *m; caddr_t a, b; int pktlen = 0; #ifdef INVARIANTS #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) #else #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) #endif for (m = m0; m != NULL; m = m->m_next) { /* * Basic pointer checks. If any of these fails then some * unrelated kernel memory before or after us is trashed. * No way to recover from that. */ a = M_START(m); b = a + M_SIZE(m); if ((caddr_t)m->m_data < a) M_SANITY_ACTION("m_data outside mbuf data range left"); if ((caddr_t)m->m_data > b) M_SANITY_ACTION("m_data outside mbuf data range right"); if ((caddr_t)m->m_data + m->m_len > b) M_SANITY_ACTION("m_data + m_len exceeds mbuf space"); /* m->m_nextpkt may only be set on first mbuf in chain. */ if (m != m0 && m->m_nextpkt != NULL) { if (sanitize) { m_freem(m->m_nextpkt); m->m_nextpkt = (struct mbuf *)0xDEADC0DE; } else M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); } /* packet length (not mbuf length!) calculation */ if (m0->m_flags & M_PKTHDR) pktlen += m->m_len; /* m_tags may only be attached to first mbuf in chain. */ if (m != m0 && m->m_flags & M_PKTHDR && !SLIST_EMPTY(&m->m_pkthdr.tags)) { if (sanitize) { m_tag_delete_chain(m, NULL); /* put in 0xDEADC0DE perhaps? */ } else M_SANITY_ACTION("m_tags on in-chain mbuf"); } /* M_PKTHDR may only be set on first mbuf in chain */ if (m != m0 && m->m_flags & M_PKTHDR) { if (sanitize) { bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); m->m_flags &= ~M_PKTHDR; /* put in 0xDEADCODE and leave hdr flag in */ } else M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); } } m = m0; if (pktlen && pktlen != m->m_pkthdr.len) { if (sanitize) m->m_pkthdr.len = 0; else M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); } return 1; #undef M_SANITY_ACTION } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion.
*/ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif #ifdef MAC /* * XXXMAC: It could be this should also occur for non-MAC? */ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with m_gethdr(). Many users * (e.g. m_copy*, m_prepend) use m_get() and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif MBUF_CHECKSLEEP(how); #ifdef MAC if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (m->m_flags & M_PKTHDR) mn = m_gethdr(how, m->m_type); else mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) m_move_pkthdr(mn, m); mn->m_next = m; m = mn; if (len < M_SIZE(m)) M_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(const struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); MBUF_CHECKSLEEP(wait); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = 0; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) n = m_gethdr(wait, m->m_type); else n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } /* * Copy an entire packet, including header (which must be present). 
* An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MBUF_CHECKSLEEP(how); n = m_get(how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(const struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; MBUF_CHECKSLEEP(how); /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ if (remain >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); nsize = MCLBYTES; } else { n = m_get(how, m->m_type); nsize = MLEN; } if (n == NULL) goto nospace; if (top == NULL) { /* First one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) { m_free(n); goto nospace; } if ((n->m_flags & M_EXT) == 0) nsize = MHLEN; n->m_flags &= ~M_RDONLY; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. 
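 *
 * A short usage sketch (the chains "m" and "n" are hypothetical):
 *
 *	m_cat(m, n);		(n is absorbed; do not reference it again)
 *
 * If m carries a packet header, either recompute its length afterwards
 * with m_fixhdr(m), or use m_catpkt() below, which maintains it.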
*/ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (!M_WRITABLE(m) || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } /* * Concatenate two pkthdr mbuf chains. */ void m_catpkt(struct mbuf *m, struct mbuf *n) { M_ASSERTPKTHDR(m); M_ASSERTPKTHDR(n); m->m_pkthdr.len += n->m_pkthdr.len; m_demote(n, 1, 0); m_cat(m, n); } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } } /* * Rearrange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod will work * for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns NULL on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later).
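 *
 * An illustrative call (the sizes are hypothetical): gather the first
 * 20 bytes at the front of a fresh mbuf while reserving 14 bytes of
 * leading space for a header to be prepended later:
 *
 *	m = m_copyup(m, 20, 14);
 *	if (m == NULL)
 *		return (ENOBUFS);	(the original chain was freed)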
*/ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; if (len > (MHLEN - dstoff)) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. */ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; MBUF_CHECKSLEEP(wait); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR && remain == 0) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_next = m->m_next; m->m_next = NULL; n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); } else if (m0->m_flags & M_PKTHDR) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ M_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else M_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { n = m_get(wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_data = m->m_data + len; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. 
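 *
 * A hypothetical receive path (the softc fields shown are illustrative,
 * not part of this file) might copy a frame out of device memory with:
 *
 *	m = m_devget(sc->rx_buf, pktlen, 0, sc->ifp, NULL);
 *	if (m == NULL)
 *		return;			(no mbufs; drop the frame)
 *
 * Passing a NULL copy routine falls back to bcopy().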
*/ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ if (m && totlen + off + max_linkhdr <= MLEN) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (m == NULL) return NULL; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { m_freem(top); return NULL; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. */ void m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { if (m->m_next == NULL && (len > m->m_len - off)) { m->m_len += min(len - (m->m_len - off), M_TRAILINGSPACE(m)); } mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * Return 1 if able to complete the job; otherwise 0. */ int m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space, remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len, remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. 
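 *
 * For example, a caller could accumulate a byte sum over a range with a
 * helper of the required shape (both "sum_bytes" and its use here are a
 * sketch, not part of this file):
 *
 *	static int
 *	sum_bytes(void *arg, void *data, u_int len)
 *	{
 *		u_int i, *sum = arg;
 *
 *		for (i = 0; i < len; i++)
 *			*sum += ((u_char *)data)[i];
 *		return (0);	(a nonzero return stops the walk)
 *	}
 *
 *	u_int sum = 0;
 *	error = m_apply(m, off, len, sum_bytes, &sum);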
*/ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { u_int count; int rval; KASSERT(off >= 0, ("m_apply, negative off %d", off)); KASSERT(len >= 0, ("m_apply, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); count = min(m->m_len - off, len); rval = (*f)(arg, mtod(m, caddr_t) + off, count); if (rval) return (rval); len -= count; off = 0; m = m->m_next; } return (0); } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search. */ if (m->m_len > loc) { *off = loc; return (m); } else { loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data. */ *off = m->m_len; return (m); } return (NULL); } m = m->m_next; } } return (NULL); } void m_print(const struct mbuf *m, int maxlen) { int len; int pdata; const struct mbuf *m2; if (m == NULL) { printf("mbuf: %p\n", m); return; } if (m->m_flags & M_PKTHDR) len = m->m_pkthdr.len; else len = -1; m2 = m; while (m2 != NULL && (len == -1 || len)) { pdata = m2->m_len; if (maxlen != -1 && pdata > maxlen) pdata = maxlen; printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); if (pdata) printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); if (len != -1) len -= m2->m_len; m2 = m2->m_next; } if (len > 0) printf("%d bytes unaccounted for.\n", len); return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. 
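 *
 * Typical defensive use in a transmit path (a sketch; the surrounding
 * error handling is hypothetical):
 *
 *	n = m_defrag(m, M_NOWAIT);
 *	if (n == NULL) {
 *		m_freem(m);	(the original chain is still intact)
 *		return (ENOBUFS);
 *	}
 *	m = n;			(on success the old chain was freed)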
*/ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; MBUF_CHECKSLEEP(how); if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_final) m_freem(m_final); return (NULL); } /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible, NULL is returned and * the original mbuf chain is left in its present (potentially * modified) state. We use two techniques: collapsing consecutive * mbufs and replacing consecutive mbufs by a cluster. * * NB: this should really be named m_defrag but that name is taken */ struct mbuf * m_collapse(struct mbuf *m0, int how, int maxfrags) { struct mbuf *m, *n, *n2, **prev; u_int curfrags; /* * Calculate the current number of frags. */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) curfrags++; /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the * pkthdr. This may be suboptimal if the first mbuf has much * less data than the following. */ m = m0; again: for (;;) { n = m->m_next; if (n == NULL) break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, n->m_len); m->m_len += n->m_len; m->m_next = n->m_next; m_free(n); if (--curfrags <= maxfrags) return m0; } else m = n; } KASSERT(maxfrags > 1, ("maxfrags %u, but normal collapse failed", maxfrags)); /* * Collapse consecutive mbufs to a cluster. */ prev = &m0->m_next; /* NB: not the first mbuf */ while ((n = *prev) != NULL) { if ((n2 = n->m_next) != NULL && n->m_len + n2->m_len < MCLBYTES) { m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; bcopy(mtod(n, void *), mtod(m, void *), n->m_len); bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, n2->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; m_free(n); m_free(n2); if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ return m0; /* * Still not there, try the normal collapse * again before we allocate another cluster. */ goto again; } prev = &n->m_next; } /* * No place where we can collapse to a cluster; punt. * This can occur if, for example, you request 2 frags * but the packet requires that both be clusters (we * never reallocate the first mbuf to avoid moving the * packet header). */ bad: return NULL; } #ifdef MBUF_STRESS_TEST /* * Fragment an mbuf chain.
There's no reason you'd ever want to do * this in normal usage, but it's great for stress testing various * mbuf consumers. * * If fragmentation is not possible, the original chain will be * returned. * * Possible length values: * 0 no fragmentation will occur * > 0 each fragment will be of the specified length * -1 each fragment will be the same random value in length * -2 each fragment's length will be entirely random * (Random values range from 1 to 256) */ struct mbuf * m_fragment(struct mbuf *m0, int how, int length) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0; if (!(m0->m_flags & M_PKTHDR)) return (m0); if ((length == 0) || (length < -2)) return (m0); m_fixhdr(m0); /* Needed sanity check */ m_final = m_getcl(how, MT_DATA, M_PKTHDR); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; if (length == -1) length = 1 + (arc4random() & 255); while (progress < m0->m_pkthdr.len) { int fraglen; if (length > 0) fraglen = length; else fraglen = 1 + (arc4random() & 255); if (fraglen > m0->m_pkthdr.len - progress) fraglen = m0->m_pkthdr.len - progress; if (fraglen > MCLBYTES) fraglen = MCLBYTES; if (m_new == NULL) { m_new = m_getcl(how, MT_DATA, 0); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t)); progress += fraglen; m_new->m_len = fraglen; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } m_freem(m0); m0 = m_final; return (m0); nospace: if (m_final) m_freem(m_final); /* Return the original chain on failure */ return (m0); } #endif /* * Copy the contents of uio into a properly sized mbuf chain. */ struct mbuf * m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) { struct mbuf *m, *mb; int error, length; ssize_t total; int progress = 0; /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* * The smallest unit returned by m_getm2() is a single mbuf * with pkthdr. We can't align past it. */ if (align >= MHLEN) return (NULL); /* * Give us the full allocation or nothing. * If len is zero return the smallest empty mbuf. */ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); if (m == NULL) return (NULL); m->m_data += align; /* Fill all mbufs with uio data and update header information. */ for (mb = m; mb != NULL; mb = mb->m_next) { length = min(M_TRAILINGSPACE(mb), total - progress); error = uiomove(mtod(mb, void *), length, uio); if (error) { m_freem(m); return (NULL); } mb->m_len = length; progress += length; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } KASSERT(progress == total, ("%s: progress != total", __func__)); return (m); } /* * Copy an mbuf chain into a uio limited by len if set. */ int m_mbuftouio(struct uio *uio, struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Create a writable copy of the mbuf chain. While doing this * we compact the chain with a goal of producing a chain with * at most two mbufs. The second mbuf in this chain is likely * to be a cluster. The primary purpose of this work is to create * a writable packet for encryption, compression, etc. 
The * secondary goal is to linearize the data so the data can be * passed to crypto hardware in the most efficient manner possible. */ struct mbuf * m_unshare(struct mbuf *m0, int how) { struct mbuf *m, *mprev; struct mbuf *n, *mfirst, *mlast; int len, off; mprev = NULL; for (m = m0; m != NULL; m = mprev->m_next) { /* * Regular mbufs are ignored unless there's a cluster * in front of it that we can use to coalesce. We do * the latter mainly so later clusters can be coalesced * also w/o having to handle them specially (i.e. convert * mbuf+cluster -> cluster). This optimization is heavily * influenced by the assumption that we're running over * Ethernet where MCLBYTES is large enough that the max * packet size will permit lots of coalescing into a * single cluster. This in turn permits efficient * crypto operations, especially when using hardware. */ if ((m->m_flags & M_EXT) == 0) { if (mprev && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_mbcoalesced++; #endif } else { mprev = m; } continue; } /* * Writable mbufs are left alone (for now). */ if (M_WRITABLE(m)) { mprev = m; continue; } /* * Not writable, replace with a copy or coalesce with * the previous mbuf if possible (since we have to copy * it anyway, we try to reduce the number of mbufs and * clusters so that future work is easier). */ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); /* NB: we only coalesce into a cluster or larger */ if (mprev != NULL && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_clcoalesced++; #endif continue; } /* * Allocate new space to hold the copy and copy the data. * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by * splitting them into clusters. We could just malloc a * buffer and make it external but too many device drivers * don't know how to break up the non-contiguous memory when * doing DMA. 
*/ n = m_getcl(how, m->m_type, m->m_flags); if (n == NULL) { m_freem(m0); return (NULL); } if (m->m_flags & M_PKTHDR) { KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", __func__, m0, m)); m_move_pkthdr(n, m); } len = m->m_len; off = 0; mfirst = n; mlast = NULL; for (;;) { int cc = min(len, MCLBYTES); memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); n->m_len = cc; if (mlast != NULL) mlast->m_next = n; mlast = n; #if 0 newipsecstat.ips_clcopied++; #endif len -= cc; if (len <= 0) break; off += cc; n = m_getcl(how, m->m_type, m->m_flags); if (n == NULL) { m_freem(mfirst); m_freem(m0); return (NULL); } } n->m_next = m->m_next; if (mprev == NULL) m0 = mfirst; /* new head of chain */ else mprev->m_next = mfirst; /* replace old mbuf */ m_free(m); /* release old mbuf */ mprev = mfirst; } return (m0); } #ifdef MBUF_PROFILING #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ struct mbufprofile { uintmax_t wasted[MP_BUCKETS]; uintmax_t used[MP_BUCKETS]; uintmax_t segments[MP_BUCKETS]; } mbprof; #define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */ #define MP_NUMLINES 6 #define MP_NUMSPERLINE 16 #define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */ /* work out max space needed and add a bit of spare space too */ #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE) #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES) char mbprofbuf[MP_BUFSIZE]; void m_profile(struct mbuf *m) { int segments = 0; int used = 0; int wasted = 0; while (m) { segments++; used += m->m_len; if (m->m_flags & M_EXT) { wasted += MHLEN - sizeof(m->m_ext) + m->m_ext.ext_size - m->m_len; } else { if (m->m_flags & M_PKTHDR) wasted += MHLEN - m->m_len; else wasted += MLEN - m->m_len; } m = m->m_next; } /* be paranoid.. it helps */ if (segments > MP_BUCKETS - 1) segments = MP_BUCKETS - 1; if (used > 100000) used = 100000; if (wasted > 100000) wasted = 100000; /* store in the appropriate bucket */ /* don't bother locking. if it's slightly off, so what? 
*/ mbprof.segments[segments]++; mbprof.used[fls(used)]++; mbprof.wasted[fls(wasted)]++; } static void mbprof_textify(void) { int offset; char *c; uint64_t *p; p = &mbprof.wasted[0]; c = mbprofbuf; offset = snprintf(c, MP_MAXLINE + 10, "wasted:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.wasted[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.used[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "used:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.used[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.segments[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "segments:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.segments[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %jju", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif } static int mbprof_handler(SYSCTL_HANDLER_ARGS) { int error; mbprof_textify(); error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1); return (error); } static int mbprof_clr_handler(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(&mbprof, sizeof(mbprof)); } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, mbprof_handler, "A", "mbuf profiling statistics"); SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics"); #endif Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 293438) +++ head/sys/kern/uipc_syscalls.c (revision 293439) @@ -1,2570 +1,2748 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include #include #include #include #include #include #include /* * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC * and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; -/* - * sendfile(2)-related variables and associated sysctls - */ -static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0, - "sendfile(2) tunables"); -static int sfreadahead = 1; -SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, - &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); - static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * A reference on the file entry is held upon returning. 
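 *
 * Callers in this file follow this pattern (CAP_BIND is just one example
 * of a right to require):
 *
 *	cap_rights_t rights;
 *
 *	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
 *	    &fp, NULL);
 *	if (error != 0)
 *		return (error);
 *	...
 *	fdrop(fp, td);		(release the held reference when done)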
*/ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(td, uap) struct thread *td; struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct socket *so; struct file *fp; int fd, error, type, oflag, fflag; AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); type = uap->type; oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, type, uap->protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, type, uap->protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bind(td, uap) struct thread *td; struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), &fp, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bindat(td, uap) struct thread *td; struct bindat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } /* ARGSUSED */ int sys_listen(td, uap) struct thread *td; struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, uap->backlog, td); fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, 
NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), &headfp, &fflag); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); if (error != 0) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error != 0) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). 
*/ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int sys_connect(td, uap) struct thread *td; struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error, interrupted = 0; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), &fp, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return 
(error); } /* ARGSUSED */ int sys_connectat(td, uap) struct thread *td; struct connectat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(td, s, mp, flags) struct thread *td; int s; struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: 
free(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } error = getsock_cap(td, s, &rights, &fp, NULL); if (error != 0) return (error); so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) goto bad; } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(td, uap) struct thread *td; struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return 
(error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), &fp, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. 
*/ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(td, uap) struct thread *td; struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. 
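Illustrative userland counterpart (an addition for clarity, not part of the
original code): the copyout loop above truncates ancillary data to the
caller's msg_controllen and raises MSG_CTRUNC, so receivers should size
the control buffer with CMSG_SPACE() and test the flag afterwards.  Names
and buffer sizes below are arbitrary.
*/

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t
recv_with_cmsg_check(int s, void *buf, size_t len)
{
	char cbuf[CMSG_SPACE(sizeof(int))];	/* room for one SCM_RIGHTS fd */
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg;
	ssize_t n;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	n = recvmsg(s, &msg, 0);
	if (n >= 0 && (msg.msg_flags & MSG_CTRUNC))
		return (-1);	/* kernel truncated the control data */
	return (n);
}

/*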
*/ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int sys_shutdown(td, uap) struct thread *td; struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. 
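A short userland note (illustrative, not from the change itself): given
the compatibility shim above, new code should not depend on shutdown(2)
returning 0 for an unconnected socket; treating ENOTCONN as benign keeps
it portable across the behavior change.
*/

#include <sys/socket.h>
#include <errno.h>

static int
shutdown_quietly(int s, int how)
{

	if (shutdown(s, how) == -1 && errno != ENOTCONN)
		return (-1);	/* a real error */
	return (0);		/* done, or the socket was simply not connected */
}

/*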
*/ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } /* ARGSUSED */ int sys_setsockopt(td, uap) struct thread *td; struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return (error); } /* ARGSUSED */ int sys_getsockopt(td, uap) struct thread *td; struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or a kernel pointer; optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name.
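Illustrative in-kernel usage (a hypothetical caller, not part of this
file): kernel consumers pass UIO_SYSSPACE so that sopt_td stays NULL and
the option value is treated as a kernel pointer, e.g. to force
SO_REUSEADDR on a socket descriptor they manage:
*/

static int
set_reuseaddr(struct thread *td, int fd)
{
	int one = 1;

	/* 'one' is kernel memory, hence UIO_SYSSPACE. */
	return (kern_setsockopt(td, fd, SOL_SOCKET, SO_REUSEADDR,
	    &one, UIO_SYSSPACE, sizeof(one)));
}

/*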
*/ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. 
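Userland view (illustrative, not from the original code): alen is a
value-result argument, copied in before the lookup and out after it, so a
caller initializes it to the buffer size and may see it shrunk to the
address's actual sa_len:
*/

#include <sys/socket.h>

static int
peer_name(int s, struct sockaddr_storage *ss)
{
	socklen_t len = sizeof(*ss);	/* in: buffer size; out: actual size */

	return (getpeername(s, (struct sockaddr *)ss, &len));
}

/*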
*/ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } struct sendfile_sync { struct mtx mtx; struct cv cv; unsigned count; }; /* * Add more references to a vm_page + sf_buf + sendfile_sync. */ void sf_ext_ref(void *arg1, void *arg2) { struct sf_buf *sf = arg1; struct sendfile_sync *sfs = arg2; vm_page_t pg = sf_buf_page(sf); sf_buf_ref(sf); vm_page_lock(pg); vm_page_wire(pg); vm_page_unlock(pg); if (sfs != NULL) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); sfs->count++; mtx_unlock(&sfs->mtx); } } /* * Detach mapped page and release resources back to the system. */ void sf_ext_free(void *arg1, void *arg2) { struct sf_buf *sf = arg1; struct sendfile_sync *sfs = arg2; vm_page_t pg = sf_buf_page(sf); sf_buf_free(sf); vm_page_lock(pg); - vm_page_unwire(pg, PQ_INACTIVE); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ - if (pg->wire_count == 0 && pg->object == NULL) + if (vm_page_unwire(pg, PQ_INACTIVE) && pg->object == NULL) vm_page_free(pg); vm_page_unlock(pg); if (sfs != NULL) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* + * Same as above, but forces the page to be detached from the object + * and go into free pool. + */ +void +sf_ext_free_nocache(void *arg1, void *arg2) +{ + struct sf_buf *sf = arg1; + struct sendfile_sync *sfs = arg2; + vm_page_t pg = sf_buf_page(sf); + + sf_buf_free(sf); + + vm_page_lock(pg); + if (vm_page_unwire(pg, PQ_NONE)) { + vm_object_t obj; + + /* Try to free the page, but only if it is cheap to. 
*/ + if ((obj = pg->object) == NULL) + vm_page_free(pg); + else if (!vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) { + vm_page_free(pg); + VM_OBJECT_WUNLOCK(obj); + } else + vm_page_deactivate(pg); + } + vm_page_unlock(pg); + + if (sfs != NULL) { + mtx_lock(&sfs->mtx); + KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); + if (--sfs->count == 0) + cv_signal(&sfs->cv); + mtx_unlock(&sfs->mtx); + } +} + +/* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sys_sendfile(struct thread *td, struct sendfile_args *uap) { return (do_sendfile(td, uap, 0)); } static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; struct file *fp; cap_rights_t rights; off_t sbytes; int error; /* * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if (uap->offset < 0) return (EINVAL); hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error != 0) goto out; } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; } } AUDIT_ARG_FD(uap->fd); /* * sendfile(2) can start at any offset within a file so we require * CAP_READ+CAP_SEEK = CAP_PREAD. */ if ((error = fget_read(td, uap->fd, cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { goto out; } error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td); fdrop(fp, td); if (uap->sbytes != NULL) copyout(&sbytes, uap->sbytes, sizeof(off_t)); out: free(hdr_uio, M_IOV); free(trl_uio, M_IOV); return (error); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (do_sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ + /* + * How much data to put into page i of n. + * Only first and last pages are special. + */ +static inline off_t +xfsize(int i, int n, off_t off, off_t len) +{ + + if (i == 0) + return (omin(PAGE_SIZE - (off & PAGE_MASK), len)); + + if (i == n - 1 && ((off + len) & PAGE_MASK) > 0) + return ((off + len) & PAGE_MASK); + + return (PAGE_SIZE); +} + +/* + * Offset within object for i page. + */ +static inline vm_offset_t +vmoff(int i, off_t off) +{ + + if (i == 0) + return ((vm_offset_t)off); + + return (trunc_page(off + i * PAGE_SIZE)); +} + +/* + * Pretend as if we don't have enough space, subtract xfsize() of + * all pages that failed. + */ +static inline void +fixspace(int old, int new, off_t off, int *space) +{ + + KASSERT(old > new, ("%s: old %d new %d", __func__, old, new)); + + /* Subtract last one. */ + *space -= xfsize(old - 1, old, off, *space); + old--; + + if (new == old) + /* There was only one page. */ + return; + + /* Subtract first one. 
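Worked example (illustrative, assuming PAGE_SIZE = 4096): with off = 100
and *space = 10000 covering pages 0..2, suppose pages 1..2 failed
(old = 3, new = 1).  The last page gives back xfsize(2, 3, 100, 10000) =
(100 + 10000) & 4095 = 1908 bytes and the middle page a full 4096,
leaving *space = 3996, exactly xfsize(0, ...), the data still backed by
page 0.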
*/ + if (new == 0) { + *space -= xfsize(0, old, off, *space); + new++; + } + + /* Rest of pages are full sized. */ + *space -= (old - new) * PAGE_SIZE; + + KASSERT(*space >= 0, ("%s: space went backwards", __func__)); +} + +/* + * Structure describing a single sendfile(2) I/O, which may consist of + * several underlying pager I/Os. + * + * The syscall context allocates the structure and initializes 'nios' + * to 1. As sendfile_swapin() runs through pages and starts asynchronous + * paging operations, it increments 'nios'. + * + * Every I/O completion calls sf_iodone(), which decrements the 'nios', and + * the syscall also calls sf_iodone() after allocating all mbufs, linking them + * and sending to socket. Whoever reaches zero 'nios' is responsible to + * call pru_ready on the socket, to notify it of readyness of the data. + */ +struct sf_io { + volatile u_int nios; + u_int error; + int npages; + struct file *sock_fp; + struct mbuf *m; + vm_page_t pa[]; +}; + +static void +sf_iodone(void *arg, vm_page_t *pg, int count, int error) +{ + struct sf_io *sfio = arg; + struct socket *so; + + for (int i = 0; i < count; i++) + vm_page_xunbusy(pg[i]); + + if (error) + sfio->error = error; + + if (!refcount_release(&sfio->nios)) + return; + + so = sfio->sock_fp->f_data; + + if (sfio->error) { + struct mbuf *m; + + /* + * I/O operation failed. The state of data in the socket + * is now inconsistent, and all what we can do is to tear + * it down. Protocol abort method would tear down protocol + * state, free all ready mbufs and detach not ready ones. + * We will free the mbufs corresponding to this I/O manually. + * + * The socket would be marked with EIO and made available + * for read, so that application receives EIO on next + * syscall and eventually closes the socket. + */ + so->so_proto->pr_usrreqs->pru_abort(so); + so->so_error = EIO; + + m = sfio->m; + for (int i = 0; i < sfio->npages; i++) + m = m_free(m); + } else { + CURVNET_SET(so->so_vnet); + (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, + sfio->npages); + CURVNET_RESTORE(); + } + + /* XXXGL: curthread */ + fdrop(sfio->sock_fp, curthread); + free(sfio, M_TEMP); +} + +/* + * Iterate through pages vector and request paging for non-valid pages. + */ static int -sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, - off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) +sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len, + int npages, int rhpages, int flags) { - vm_page_t m; - vm_pindex_t pindex; - ssize_t resid; - int error, readahead, rv; + vm_page_t *pa = sfio->pa; + int nios; - pindex = OFF_TO_IDX(off); - VM_OBJECT_WLOCK(obj); - m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | - VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); + nios = 0; + flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0; /* - * Check if page is valid for what we need, otherwise initiate I/O. - * - * The non-zero nd argument prevents disk I/O, instead we - * return the caller what he specified in nd. In particular, - * if we already turned some pages into mbufs, nd == EAGAIN - * and the main function send them the pages before we come - * here again and block. + * First grab all the pages and wire them. Note that we grab + * only required pages. Readahead pages are dealt with later. 
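Standalone sketch (a simplified illustration using C11 atomics in place
of the kernel's refcount(9)): the 'nios' protocol described above --
initialize to 1, take a reference per asynchronous I/O, and have the
submitter drop its own reference last -- makes the completion action run
exactly once no matter how submission and I/O completion interleave.
*/

#include <stdatomic.h>
#include <stdbool.h>

struct io_batch {
	atomic_uint	nios;	/* 1 for the submitter + 1 per in-flight I/O */
};

/* Returns true for whoever drops the final reference. */
static bool
io_batch_done(struct io_batch *b)
{

	return (atomic_fetch_sub(&b->nios, 1) == 1);
}

static void
io_batch_submit(struct io_batch *b, int n_ios)
{

	atomic_init(&b->nios, 1);		/* the submitter's reference */
	for (int i = 0; i < n_ios; i++) {
		atomic_fetch_add(&b->nios, 1);	/* one per started I/O */
		/* ... start async I/O whose completion calls io_batch_done() ... */
	}
	if (io_batch_done(b)) {
		/* Everything already completed; finalize (cf. pru_ready). */
	}
}

/*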
*/ - if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { - if (vp == NULL) - vm_page_xunbusy(m); - VM_OBJECT_WUNLOCK(obj); - *res = m; - return (0); - } else if (nd != 0) { - if (vp == NULL) - vm_page_xunbusy(m); - error = nd; - goto free_page; + VM_OBJECT_WLOCK(obj); + for (int i = 0; i < npages; i++) { + pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)), + VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags); + if (pa[i] == NULL) { + npages = i; + rhpages = 0; + break; + } } - /* - * Get the page from backing store. - */ - error = 0; - if (vp != NULL) { - VM_OBJECT_WUNLOCK(obj); - readahead = sfreadahead * MAXBSIZE; + for (int i = 0; i < npages;) { + int j, a, count, rv; + /* Skip valid pages. */ + if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK, + xfsize(i, npages, off, len))) { + vm_page_xunbusy(pa[i]); + SFSTAT_INC(sf_pages_valid); + i++; + continue; + } + /* - * Use vn_rdwr() instead of the pager interface for - * the vnode, to allow the read-ahead. - * - * XXXMAC: Because we don't have fp->f_cred here, we - * pass in NOCRED. This is probably wrong, but is - * consistent with our original implementation. + * Now 'i' points to first invalid page, iterate further + * to make 'j' point at first valid after a bunch of + * invalid ones. */ - error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), - UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / - bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); - SFSTAT_INC(sf_iocnt); - VM_OBJECT_WLOCK(obj); - } else { - if (vm_pager_has_page(obj, pindex, NULL, NULL)) { - rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); - SFSTAT_INC(sf_iocnt); - if (rv != VM_PAGER_OK) { - vm_page_lock(m); - vm_page_free(m); - vm_page_unlock(m); - m = NULL; - error = EIO; + for (j = i + 1; j < npages; j++) + if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, + xfsize(j, npages, off, len))) { + SFSTAT_INC(sf_pages_valid); + break; } - } else { - pmap_zero_page(m); - m->valid = VM_PAGE_BITS_ALL; - m->dirty = 0; + + /* + * Now we got region of invalid pages between 'i' and 'j'. + * Check that they belong to pager. They may not be there, + * which is a regular situation for shmem pager. For vnode + * pager this happens only in case of sparse file. + * + * Important feature of vm_pager_has_page() is the hint + * stored in 'a', about how many pages we can pagein after + * this page in a single I/O. + */ + while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), + NULL, &a) && i < j) { + pmap_zero_page(pa[i]); + pa[i]->valid = VM_PAGE_BITS_ALL; + pa[i]->dirty = 0; + vm_page_xunbusy(pa[i]); + i++; } - if (m != NULL) - vm_page_xunbusy(m); - } - if (error == 0) { - *res = m; - } else if (m != NULL) { -free_page: - vm_page_lock(m); - vm_page_unwire(m, PQ_INACTIVE); + if (i == j) + continue; /* - * See if anyone else might know about this page. If - * not and it is not valid, then free it. + * We want to pagein as many pages as possible, limited only + * by the 'a' hint and actual request. + * + * We should not pagein into already valid page, thus if + * 'j' didn't reach last page, trim by that page. + * + * When the pagein fulfils the request, also specify readahead. */ - if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) - vm_page_free(m); - vm_page_unlock(m); + if (j < npages) + a = min(a, j - i - 1); + count = min(a + 1, npages - i); + + refcount_acquire(&sfio->nios); + rv = vm_pager_get_pages_async(obj, pa + i, count, NULL, + i + count == npages ? 
&rhpages : NULL, + &sf_iodone, sfio); + KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p", + __func__, obj, pa[i])); + + SFSTAT_INC(sf_iocnt); + SFSTAT_ADD(sf_pages_read, count); + if (i + count == npages) + SFSTAT_ADD(sf_rhpages_read, rhpages); + +#ifdef INVARIANTS + for (j = i; j < i + count && j < npages; j++) + KASSERT(pa[j] == vm_page_lookup(obj, + OFF_TO_IDX(vmoff(j, off))), + ("pa[j] %p lookup %p\n", pa[j], + vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))))); +#endif + i += count; + nios++; } - KASSERT(error != 0 || (m->wire_count > 0 && - vm_page_is_valid(m, off & PAGE_MASK, xfsize)), - ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, - xfsize)); + VM_OBJECT_WUNLOCK(obj); - return (error); + + if (nios == 0 && npages != 0) + SFSTAT_INC(sf_noiocnt); + + return (nios); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } } else if (fp->f_type == DTYPE_SHM) { error = 0; shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp, 0); return (error); } static int kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { cap_rights_t rights; int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. 
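Illustrative userland sketch (not part of the original code): since
sendfile_getobj() above accepts DTYPE_SHM descriptors as well as regular
files, a POSIX shared memory object can be sent directly.  Error handling
is elided for brevity.
*/

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <unistd.h>

static int
send_shm(int sock, size_t size)
{
	off_t sbytes;
	int fd;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	ftruncate(fd, size);
	/* ... fill the object through mmap(2) ... */
	return (sendfile(fd, sock, 0, size, NULL, &sbytes, 0));
}

/*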
*/ error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), sock_fp, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); if (((*so)->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; struct socket *so; - struct mbuf *m; + struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; - struct vm_page *pg; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; - off_t off, xfsize, fsbytes, sbytes, rem, obj_size; - int error, bsize, nd, hdrlen, mnw; + off_t off, sbytes, rem, obj_size; + int error, softerr, bsize, hdrlen; - pg = NULL; obj = NULL; so = NULL; - m = NULL; + m = mh = NULL; sfs = NULL; - fsbytes = sbytes = 0; - hdrlen = mnw = 0; - rem = nbytes; - obj_size = 0; + sbytes = 0; + softerr = 0; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); - if (rem == 0) - rem = obj_size; error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; - /* - * Do not wait on memory allocations but return ENOMEM for - * caller to retry later. - * XXX: Experimental. - */ - if (flags & SF_MNOWAIT) - mnw = 1; +#ifdef MAC + error = mac_socket_check_send(td->td_ucred, so); + if (error != 0) + goto out; +#endif + SFSTAT_INC(sf_syscalls); + SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags)); + if (flags & SF_SYNC) { sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); } -#ifdef MAC - error = mac_socket_check_send(td->td_ucred, so); - if (error != 0) - goto out; -#endif - /* If headers are specified copy them into mbufs. */ - if (hdr_uio != NULL) { + if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; - if (hdr_uio->uio_resid > 0) { - /* - * In FBSD < 5.0 the nbytes to send also included - * the header. If compat is specified subtract the - * header size from nbytes. - */ - if (kflags & SFK_COMPAT) { - if (nbytes > hdr_uio->uio_resid) - nbytes -= hdr_uio->uio_resid; - else - nbytes = 0; - } - m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), - 0, 0, 0); - if (m == NULL) { - error = mnw ? EAGAIN : ENOBUFS; - goto out; - } - hdrlen = m_length(m, NULL); + /* + * In FBSD < 5.0 the nbytes to send also included + * the header. If compat is specified subtract the + * header size from nbytes. + */ + if (kflags & SFK_COMPAT) { + if (nbytes > hdr_uio->uio_resid) + nbytes -= hdr_uio->uio_resid; + else + nbytes = 0; } - } + mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0); + hdrlen = m_length(mh, &mhtail); + } else + hdrlen = 0; + rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; + /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. */ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. 
The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ - for (off = offset; ; ) { + for (off = offset; rem > 0; ) { + struct sf_io *sfio; + vm_page_t *pa; struct mbuf *mtail; - int loopbytes; - int space; - int done; + int nios, space, npages, rhpages; - if ((nbytes != 0 && nbytes == fsbytes) || - (nbytes == 0 && obj_size == fsbytes)) - break; - mtail = NULL; - loopbytes = 0; - space = 0; - done = 0; - /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * Reduce space in the socket buffer by the size of * the header mbuf chain. * hdrlen is set to 0 after the first loop. */ space -= hdrlen; if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0 || off >= va.va_size) { VOP_UNLOCK(vp, 0); goto done; } - obj_size = va.va_size; + if (va.va_size != obj_size) { + if (nbytes == 0) + rem += va.va_size - obj_size; + else if (offset + nbytes > va.va_size) + rem -= (offset + nbytes - va.va_size); + obj_size = va.va_size; + } } + if (space > rem) + space = rem; + + npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); + /* + * Calculate maximum allowed number of pages for readahead + * at this iteration. First, we allow readahead up to "rem". + * If application wants more, let it be, but there is no + * reason to go above MAXPHYS. Also check against "obj_size", + * since vm_pager_has_page() can hint beyond EOF. 
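Worked example (illustrative, assuming PAGE_SIZE = 4096 and MAXPHYS =
128 KB, i.e. 32 pages): with off = 100, space = 20000, rem = 100000 and a
requested readahead of 16 pages, npages = howmany(20100, 4096) = 5;
rhpages starts at howmany(100100, 4096) - 5 = 20, grows to 36 with the
request, and the MAXPHYS bound below clamps it to 32 (the object size may
trim it further).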
+ */ + rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; + rhpages += SF_READAHEAD(flags); + rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); + rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - + npages, rhpages); + + sfio = malloc(sizeof(struct sf_io) + + npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); + refcount_init(&sfio->nios, 1); + sfio->error = 0; + + nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, + flags); + + /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ - while (space > loopbytes) { - vm_offset_t pgoff; + pa = sfio->pa; + for (int i = 0; i < npages; i++) { struct mbuf *m0; /* - * Calculate the amount to transfer. - * Not to exceed a page, the EOF, - * or the passed in nbytes. + * If a page wasn't grabbed successfully, then + * trim the array. Can happen only with SF_NODISKIO. */ - pgoff = (vm_offset_t)(off & PAGE_MASK); - rem = obj_size - offset; - if (nbytes != 0) - rem = omin(rem, nbytes); - rem -= fsbytes + loopbytes; - xfsize = omin(PAGE_SIZE - pgoff, rem); - xfsize = omin(space - loopbytes, xfsize); - if (xfsize <= 0) { - done = 1; /* all data sent */ + if (pa[i] == NULL) { + SFSTAT_INC(sf_busy); + fixspace(npages, i, off, &space); + npages = i; + softerr = EBUSY; break; } /* - * Attempt to look up the page. Allocate - * if not found or wait and loop if busy. - */ - if (m != NULL) - nd = EAGAIN; /* send what we already got */ - else if ((flags & SF_NODISKIO) != 0) - nd = EBUSY; - else - nd = 0; - error = sendfile_readpage(obj, vp, nd, off, - xfsize, bsize, td, &pg); - if (error != 0) { - if (error == EAGAIN) - error = 0; /* not a real error */ - break; - } - - /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ - sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : - SFB_CATCH); + sf = sf_buf_alloc(pa[i], + m != NULL ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); - vm_page_lock(pg); - vm_page_unwire(pg, PQ_INACTIVE); - KASSERT(pg->object != NULL, - ("%s: object disappeared", __func__)); - vm_page_unlock(pg); + for (int j = i; j < npages; j++) { + vm_page_lock(pa[j]); + vm_page_unwire(pa[j], PQ_INACTIVE); + vm_page_unlock(pa[j]); + } if (m == NULL) - error = (mnw ? EAGAIN : EINTR); + softerr = ENOBUFS; + fixspace(npages, i, off, &space); + npages = i; break; } - /* - * Get an mbuf and set it up as having - * external storage. - */ - m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); - if (m0 == NULL) { - error = (mnw ? EAGAIN : ENOBUFS); - sf_ext_free(sf, NULL); - break; - } - /* - * Attach EXT_SFBUF external storage. - */ - m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf); + m0 = m_get(M_WAITOK, MT_DATA); + m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); m0->m_ext.ext_size = PAGE_SIZE; m0->m_ext.ext_arg1 = sf; m0->m_ext.ext_arg2 = sfs; - m0->m_ext.ext_type = EXT_SFBUF; + /* + * SF_NOCACHE sets the page as being freed upon send. + * However, we ignore it for the last page in 'space', + * if the page is truncated, and we got more data to + * send (rem > space), or if we have readahead + * configured (rhpages > 0). 
+ */ + if ((flags & SF_NOCACHE) == 0 || + (i == npages - 1 && + ((off + space) & PAGE_MASK) && + (rem > space || rhpages > 0))) + m0->m_ext.ext_type = EXT_SFBUF; + else + m0->m_ext.ext_type = EXT_SFBUF_NOCACHE; m0->m_ext.ext_flags = 0; - m0->m_flags |= (M_EXT|M_RDONLY); - m0->m_data = (char *)sf_buf_kva(sf) + pgoff; - m0->m_len = xfsize; + m0->m_flags |= (M_EXT | M_RDONLY); + if (nios) + m0->m_flags |= M_NOTREADY; + m0->m_data = (char *)sf_buf_kva(sf) + + (vmoff(i, off) & PAGE_MASK); + m0->m_len = xfsize(i, npages, off, space); + if (i == 0) + sfio->m = m0; + /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; - else if (m != NULL) - m_last(m)->m_next = m0; else m = m0; mtail = m0; - /* Keep track of bits processed. */ - loopbytes += xfsize; - off += xfsize; - if (sfs != NULL) { mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } } if (vp != NULL) VOP_UNLOCK(vp, 0); + /* Keep track of bytes processed. */ + off += space; + rem -= space; + + /* Prepend header, if any. */ + if (hdrlen) { + mhtail->m_next = m; + m = mh; + mh = NULL; + } + + if (m == NULL) { + KASSERT(softerr, ("%s: m NULL, no error", __func__)); + error = softerr; + free(sfio, M_TEMP); + goto done; + } + /* Add the buffer chain to the socket buffer. */ - if (m != NULL) { - int mlen, err; + KASSERT(m_length(m, NULL) == space + hdrlen, + ("%s: mlen %u space %d hdrlen %d", + __func__, m_length(m, NULL), space, hdrlen)); - mlen = m_length(m, NULL); - SOCKBUF_LOCK(&so->so_snd); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) { - error = EPIPE; - SOCKBUF_UNLOCK(&so->so_snd); - goto done; - } - SOCKBUF_UNLOCK(&so->so_snd); - CURVNET_SET(so->so_vnet); - /* Avoid error aliasing. */ - err = (*so->so_proto->pr_usrreqs->pru_send) - (so, 0, m, NULL, NULL, td); - CURVNET_RESTORE(); - if (err == 0) { - /* - * We need two counters to get the - * file offset and nbytes to send - * right: - * - sbytes contains the total amount - * of bytes sent, including headers. - * - fsbytes contains the total amount - * of bytes sent from the file. - */ - sbytes += mlen; - fsbytes += mlen; - if (hdrlen) { - fsbytes -= hdrlen; - hdrlen = 0; - } - } else if (error == 0) - error = err; - m = NULL; /* pru_send always consumes */ + CURVNET_SET(so->so_vnet); + if (nios == 0) { + /* + * If sendfile_swapin() didn't initiate any I/Os, + * which happens if all data is cached in VM, then + * we can send data right now without the + * PRUS_NOTREADY flag. + */ + free(sfio, M_TEMP); + error = (*so->so_proto->pr_usrreqs->pru_send) + (so, 0, m, NULL, NULL, td); + } else { + sfio->sock_fp = sock_fp; + sfio->npages = npages; + fhold(sock_fp); + error = (*so->so_proto->pr_usrreqs->pru_send) + (so, PRUS_NOTREADY, m, NULL, NULL, td); + sf_iodone(sfio, NULL, 0, 0); } + CURVNET_RESTORE(); - /* Quit outer loop on error or when we're done. */ - if (done) - break; - if (error != 0) + m = NULL; /* pru_send always consumes */ + if (error) goto done; + sbytes += space + hdrlen; + if (hdrlen) + hdrlen = 0; + if (softerr) { + error = softerr; + goto done; + } } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { sbunlock(&so->so_snd); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; goto out; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. 
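Illustrative userland sketch (an assumption about application structure,
not part of the change): with the soft-error handling above, a
non-blocking server can resume from offset + sbytes after EAGAIN, and
retry or hand the request to a worker after the EBUSY that SF_NODISKIO
may produce.  The back-off strategy is the application's choice.
*/

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <errno.h>

static int
send_file_range(int fd, int sock, off_t off, size_t len)
{
	off_t sbytes;

	for (;;) {
		if (sendfile(fd, sock, off, len, NULL, &sbytes,
		    SF_NODISKIO) == 0)
			return (0);
		off += sbytes;		/* partial progress still counts */
		len -= sbytes;
		if (errno == EBUSY || errno == EAGAIN)
			continue;	/* retry; real code would poll(2) first */
		return (-1);
	}
}

/*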
*/ if (error == 0) { td->td_retval[0] = 0; } if (sent != NULL) { (*sent) = sbytes; } if (obj != NULL) vm_object_deallocate(obj); if (so) fdrop(sock_fp, td); if (m) m_freem(m); + if (mh) + m_freem(mh); if (sfs != NULL) { mtx_lock(&sfs->mtx); if (sfs->count != 0) cv_wait(&sfs->cv, &sfs->mtx); KASSERT(sfs->count == 0, ("sendfile sync still busy")); cv_destroy(&sfs->cv); mtx_destroy(&sfs->mtx); free(sfs, M_TEMP); } if (error == ERESTART) error = EINTR; return (error); } Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 293438) +++ head/sys/sys/mbuf.h (revision 293439) @@ -1,1311 +1,1313 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include #include #ifdef WITNESS #include #endif #endif /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. 
*/ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint64_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t cosqos; /* class/quality of service */ uint8_t rsstype; /* hash type */ uint8_t l2hlen; /* layer 2 header length */ uint8_t l3hlen; /* layer 3 header length */ uint8_t l4hlen; /* layer 4 header length */ uint8_t l5hlen; /* layer 5 header length */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] #define tso_segsz PH_per.sixteen[1] #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct m_ext { volatile u_int *ext_cnt; /* pointer to ref count info */ caddr_t ext_buf; /* start of buffer */ uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ void (*ext_free) /* free routine if not the usual */ (struct mbuf *, void *, void *); void *ext_arg1; /* optional argument pointer */ void *ext_arg2; /* optional argument pointer */ }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. 
*/ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { struct pkthdr m_pkthdr; /* M_PKTHDR set */ union { struct m_ext m_ext; /* M_EXT set */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-12] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_UNUSED_8 0x00000100 /* --available-- */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */ #define M_PROTO3 0x00004000 /* protocol-specific */ #define M_PROTO4 0x00008000 /* protocol-specific */ #define M_PROTO5 0x00010000 /* protocol-specific */ #define M_PROTO6 0x00020000 /* protocol-specific */ #define M_PROTO7 0x00040000 /* protocol-specific */ #define M_PROTO8 0x00080000 /* protocol-specific */ #define M_PROTO9 0x00100000 /* protocol-specific */ #define M_PROTO10 0x00200000 /* protocol-specific */ #define M_PROTO11 0x00400000 /* protocol-specific */ #define M_PROTO12 0x00800000 /* protocol-specific */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG| \ M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ "\27M_PROTO11\30M_PROTO12" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. 
NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE can be set by non-RSS cards or configurations * that provide an opaque flow identifier, allowing for ordering and * distribution without explicit affinity. */ /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 1 /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 2 /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 3 /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 4 /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX 5 /* IPv6 2-tuple + ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX 6 /* TCPv6 4-tiple + ext hdrs */ /* Non-standard RSS hash types */ #define M_HASHTYPE_RSS_UDP_IPV4 7 /* IPv4 UDP 4-tuple */ #define M_HASHTYPE_RSS_UDP_IPV4_EX 8 /* IPv4 UDP 4-tuple + ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV6 9 /* IPv6 UDP 4-tuple */ #define M_HASHTYPE_RSS_UDP_IPV6_EX 10 /* IPv6 UDP 4-tuple + ext hdrs */ #define M_HASHTYPE_OPAQUE 255 /* ordering, not affinity */ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) /* * COS/QOS class and quality of service tags. * It uses DSCP code points as base. */ #define QOS_DSCP_CS0 0x00 #define QOS_DSCP_DEF QOS_DSCP_CS0 #define QOS_DSCP_CS1 0x20 #define QOS_DSCP_AF11 0x28 #define QOS_DSCP_AF12 0x30 #define QOS_DSCP_AF13 0x38 #define QOS_DSCP_CS2 0x40 #define QOS_DSCP_AF21 0x48 #define QOS_DSCP_AF22 0x50 #define QOS_DSCP_AF23 0x58 #define QOS_DSCP_CS3 0x60 #define QOS_DSCP_AF31 0x68 #define QOS_DSCP_AF32 0x70 #define QOS_DSCP_AF33 0x78 #define QOS_DSCP_CS4 0x80 #define QOS_DSCP_AF41 0x88 #define QOS_DSCP_AF42 0x90 #define QOS_DSCP_AF43 0x98 #define QOS_DSCP_CS5 0xa0 #define QOS_DSCP_EF 0xb8 #define QOS_DSCP_CS6 0xc0 #define QOS_DSCP_CS7 0xe0 /* * External mbuf storage buffer types. */ #define EXT_CLUSTER 1 /* mbuf cluster */ -#define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */ +#define EXT_SFBUF 2 /* sendfile(2)'s sf_buf */ #define EXT_JUMBOP 3 /* jumbo cluster page sized */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference (M_IOVEC) */ +#define EXT_SFBUF_NOCACHE 8 /* sendfile(2)'s sf_buf not to be cached */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. 
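Illustrative driver-side sketch (a hypothetical receive path, not part of
this change): a NIC driver publishes its hardware hash through the
flowid/rsstype pair defined above so the stack can use it for ordering
and affinity decisions.
*/

static void
rx_record_rss_hash(struct mbuf *m, uint32_t hw_hash, int is_tcp_ipv4)
{

	/* Consumed by the stack via m->m_pkthdr.flowid / M_HASHTYPE_GET(). */
	m->m_pkthdr.flowid = hw_hash;
	M_HASHTYPE_SET(m, is_tcp_ipv4 ? M_HASHTYPE_RSS_TCP_IPV4 :
	    M_HASHTYPE_OPAQUE);
}

/*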
*/ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_cnt, notyet */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR2 0x020000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR3 0x040000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR4 0x080000 /* for vendor-internal use */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. */ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * External reference/free functions. */ void sf_ext_ref(void *, void *); void sf_ext_free(void *, void *); +void sf_ext_free_nocache(void *, void *); /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ /* Inbound checksum support where the checksum was verified by hardware. */ #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESED 0x40000000 /* contains merged segments */ /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI" \ "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESED" /* CSUM flags compatibility mappings. 
*/ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). */ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; extern uma_zone_t zone_ext_refcnt; void mb_dupcl(struct mbuf *, const struct mbuf *); void mb_free_ext(struct mbuf *); int m_pkthdr_init(struct mbuf *, int); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associate an external reference counted buffer with an mbuf.
*/ static __inline void m_extaddref(struct mbuf *m, caddr_t buf, u_int size, u_int *ref_cnt, void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. */ static __inline int m_init(struct mbuf *m, uma_zone_t zone __unused, int size __unused, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) { if ((error = m_pkthdr_init(m, how)) != 0) return (error); } return (0); } static __inline struct mbuf * m_get(int how, short type) { struct mb_args args; args.flags = 0; args.type = type; return (uma_zalloc_arg(zone_mbuf, &args, how)); } /* * XXX This should be deprecated, very little use. */ static __inline struct mbuf * m_getclr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m != NULL) bzero(m->m_data, MLEN); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mb_args args; args.flags = M_PKTHDR; args.type = type; return (uma_zalloc_arg(zone_mbuf, &args, how)); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mb_args args; args.flags = flags; args.type = type; return (uma_zalloc_arg(zone_pack, &args, how)); } static __inline int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 
*/ static __inline void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); return (uma_zalloc_arg(zone, m, how)); } static __inline void m_cljset(struct mbuf *m, void *cl, int type) { uma_zone_t zone; int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; zone = zone_jumbop; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; zone = zone_jumbo9; break; case EXT_JUMBO16: size = MJUM16BYTES; zone = zone_jumbo16; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = 0; m->m_ext.ext_cnt = uma_find_refcnt(zone, cl); m->m_flags |= M_EXT; } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ (void )m_extadd((m), (caddr_t)(buf), (size), (free), (arg1), (arg2),\ (flags), (type), M_NOWAIT) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ (!(((m)->m_flags & M_EXT)) || \ (*((m)->m_ext.ext_cnt) == 1)) ) \ /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. 
Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { #ifdef INVARIANTS const char *msg = "%s: not a virgin mbuf"; #endif int adjust; KASSERT(m->m_data == M_START(m), (msg, __func__)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * Compute the amount of space available after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) \ (M_WRITABLE(m) ? \ ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 /* Compatibility with 4.3. 
*/ #define m_copy(m, o, l) m_copym((m), (o), (l), M_NOWAIT) extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ struct uio; void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_extadd(struct mbuf *, caddr_t, u_int, void (*)(struct mbuf *, void *, void *), void *, void *, int, int, int); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(const struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, struct mbuf *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, the value * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find * compatibility shim functions and several tag types are defined below. * Users that do not require compatibility should use a private cookie value * so that packet tag-related definitions can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default memory * alignment implemented by malloc. 
To reference private data one can use a * construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members of * struct foo. Otherwise it is necessary to embed struct m_tag within the * private data structure to insure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF (21 | MTAG_PERSISTENT) /* PF/ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #define PACKET_TAG_IPSEC_NAT_T_PORTS 29 /* two uint16_t */ #define PACKET_TAG_ND_OUTGOING 30 /* ND outgoing */ /* Specific cookies and tags. */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, const struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. 
*/ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static __inline int rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf.")); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf.")); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) /* flags passed as first argument for "m_ether_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t); #ifdef MBUF_PROFILING void m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif struct mbufq { STAILQ_HEAD(, mbuf) mq_head; int mq_len; int mq_maxlen; }; static inline void mbufq_init(struct mbufq *mq, int maxlen) { STAILQ_INIT(&mq->mq_head); mq->mq_maxlen = maxlen; mq->mq_len = 0; } static inline struct mbuf * mbufq_flush(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); STAILQ_INIT(&mq->mq_head); mq->mq_len = 0; return (m); } static inline void mbufq_drain(struct mbufq *mq) { struct mbuf *m, *n; n = mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf 
*m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */ Index: head/sys/sys/sf_buf.h =================================================================== --- head/sys/sys/sf_buf.h (revision 293438) +++ head/sys/sys/sf_buf.h (revision 293439) @@ -1,196 +1,203 @@ /*- * Copyright (c) 2014 Gleb Smirnoff * Copyright (c) 2003-2004 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SF_BUF_H_ #define _SYS_SF_BUF_H_ struct sfstat { /* sendfile statistics */ + uint64_t sf_syscalls; /* times sendfile was called */ + uint64_t sf_noiocnt; /* times sendfile didn't require I/O */ uint64_t sf_iocnt; /* times sendfile had to do disk I/O */ + uint64_t sf_pages_read; /* pages read as part of a request */ + uint64_t sf_pages_valid; /* pages were valid for a request */ + uint64_t sf_rhpages_requested; /* readahead pages requested */ + uint64_t sf_rhpages_read; /* readahead pages read */ + uint64_t sf_busy; /* times aborted on a busy page */ uint64_t sf_allocfail; /* times sfbuf allocation failed */ uint64_t sf_allocwait; /* times sfbuf allocation had to wait */ }; #ifdef _KERNEL #include #include #include #include #include #include /* * Sf_bufs, or sendfile(2) buffers, provide a vm_page that is mapped * into kernel address space. Note that they aren't used only * by sendfile(2)! * * Sf_bufs could be implemented as a feature of vm_page_t, but that * would require growth of the structure. That's why they are implemented * as a separate hash indexed by vm_page address. Implementation lives in * kern/subr_sfbuf.c. Meanwhile, most 64-bit machines have a physical map, * so they don't require this hash at all, thus ignore subr_sfbuf.c. * * Different 32-bit architectures impose different requirements on sf_buf * hash and functions. They request features in machine/vmparam.h, which * enable parts of this file.
They can also optionally provide helpers in * machine/sf_buf.h * * Defines are: * SFBUF This machine requires sf_buf hash. * subr_sfbuf.c should be compiled. * SFBUF_CPUSET This machine can perform SFB_CPUPRIVATE mappings, * that do not invalidate cache on the rest of CPUs. * SFBUF_NOMD This machine doesn't have machine/sf_buf.h * * SFBUF_OPTIONAL_DIRECT_MAP Value of this define is used as boolean * variable that tells whether machine is * capable of direct map or not at runtime. * SFBUF_MAP This machine provides its own sf_buf_map() and * sf_buf_unmap(). * SFBUF_PROCESS_PAGE This machine provides sf_buf_process_page() * function. */ #ifdef SFBUF #if defined(SMP) && defined(SFBUF_CPUSET) #include #endif #include struct sf_buf { LIST_ENTRY(sf_buf) list_entry; /* list of buffers */ TAILQ_ENTRY(sf_buf) free_entry; /* list of buffers */ vm_page_t m; /* currently mapped page */ vm_offset_t kva; /* va of mapping */ int ref_count; /* usage of this mapping */ #if defined(SMP) && defined(SFBUF_CPUSET) cpuset_t cpumask; /* where mapping is valid */ #endif }; #else /* ! SFBUF */ struct sf_buf; #endif /* SFBUF */ #ifndef SFBUF_NOMD #include #endif #ifdef SFBUF_OPTIONAL_DIRECT_MAP #include #endif #ifdef SFBUF struct sf_buf *sf_buf_alloc(struct vm_page *, int); void sf_buf_free(struct sf_buf *); void sf_buf_ref(struct sf_buf *); static inline vm_offset_t sf_buf_kva(struct sf_buf *sf) { #ifdef SFBUF_OPTIONAL_DIRECT_MAP if (SFBUF_OPTIONAL_DIRECT_MAP) return (SFBUF_PHYS_DMAP(VM_PAGE_TO_PHYS((vm_page_t)sf))); #endif return (sf->kva); } static inline vm_page_t sf_buf_page(struct sf_buf *sf) { #ifdef SFBUF_OPTIONAL_DIRECT_MAP if (SFBUF_OPTIONAL_DIRECT_MAP) return ((vm_page_t)sf); #endif return (sf->m); } #ifndef SFBUF_MAP #include static inline void sf_buf_map(struct sf_buf *sf, int flags) { pmap_qenter(sf->kva, &sf->m, 1); } static inline int sf_buf_unmap(struct sf_buf *sf) { return (0); } #endif /* SFBUF_MAP */ #if defined(SMP) && defined(SFBUF_CPUSET) void sf_buf_shootdown(struct sf_buf *, int); #endif #ifdef SFBUF_PROCESS_PAGE boolean_t sf_buf_process_page(vm_page_t, void (*)(struct sf_buf *)); #endif #else /* ! SFBUF */ static inline struct sf_buf * sf_buf_alloc(struct vm_page *m, int pri) { return ((struct sf_buf *)m); } static inline void sf_buf_free(struct sf_buf *sf) { } static inline void sf_buf_ref(struct sf_buf *sf) { } #endif /* SFBUF */ /* * Options to sf_buf_alloc() are specified through its flags argument. This * argument's value should be the result of a bitwise or'ing of one or more * of the following values. */ #define SFB_CATCH 1 /* Check signals if the allocation sleeps. */ #define SFB_CPUPRIVATE 2 /* Create a CPU private mapping. */ #define SFB_DEFAULT 0 #define SFB_NOWAIT 4 /* Return NULL if all bufs are used. */ extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; #define SFSTAT_ADD(name, val) \ counter_u64_add(sfstat[offsetof(struct sfstat, name) / sizeof(uint64_t)],\ (val)) #define SFSTAT_INC(name) SFSTAT_ADD(name, 1) #endif /* _KERNEL */ #endif /* !_SYS_SF_BUF_H_ */ Index: head/sys/sys/socket.h =================================================================== --- head/sys/sys/socket.h (revision 293438) +++ head/sys/sys/socket.h (revision 293439) @@ -1,672 +1,675 @@ /*- * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socket.h 8.4 (Berkeley) 2/21/94 * $FreeBSD$ */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ #endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 #endif /* * Option flags per-socket. 
*/ #define SO_DEBUG 0x0001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x0002 /* socket has had listen() */ #define SO_REUSEADDR 0x0004 /* allow local address reuse */ #define SO_KEEPALIVE 0x0008 /* keep connections alive */ #define SO_DONTROUTE 0x0010 /* just use interface addresses */ #define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x0080 /* linger on close if data present */ #define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x0800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ #define SO_BINTIME 0x2000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x8000 /* disable direct data placement */ /* * Additional options, not kept in so_options. */ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_SETFIB 0x1014 /* use this FIB to route */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. */ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. 
BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 /* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_MAX 42 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 #define AF_VENDOR02 43 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. 
*/ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. * * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. 
*/ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x1 /* process out-of-band data */ #define MSG_PEEK 0x2 /* peek at incoming message */ #define MSG_DONTROUTE 0x4 /* send without using routing tables */ #define MSG_EOR 0x8 /* data completes record */ #define MSG_TRUNC 0x10 /* data discarded before delivery */ #define MSG_CTRUNC 0x20 /* control data lost before delivery */ #define MSG_WAITALL 0x40 /* wait for full request or error */ #if __POSIX_VISIBLE >= 200809 #define MSG_NOSIGNAL 0x20000 /* do not generate SIGPIPE on EOF */ #endif #if __BSD_VISIBLE #define MSG_DONTWAIT 0x80 /* this message should be nonblocking */ #define MSG_EOF 0x100 /* data completes connection */ #define MSG_NOTIFICATION 0x2000 /* SCTP notification */ #define MSG_NBIO 0x4000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x8000 /* used in sendit() */ #define MSG_CMSG_CLOEXEC 0x40000 /* make received fds close-on-exec */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x10000 /* for use by socket callbacks - soreceive (TCP) */ #endif /* * Header for ancillary data objects in msg_control buffer. * Used for additional information with/about a datagram * not expressible by flags. The format is a sequence * of message elements headed by cmsghdr structures. */ struct cmsghdr { socklen_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ /* followed by u_char cmsg_data[]; */ }; #if __BSD_VISIBLE /* * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf and we have historically supported a * maximum of 16 groups. */ #define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer * process that has sent us a message. This is allocated by the * peer process but filled in by the kernel. This prevents the * peer from lying about its identity. (Note that cmcred_groups[0] * is the effective GID.) */ struct cmsgcred { pid_t cmcred_pid; /* PID of sending process */ uid_t cmcred_uid; /* real UID of sending process */ uid_t cmcred_euid; /* effective UID of sending process */ gid_t cmcred_gid; /* real GID of sending process */ short cmcred_ngroups; /* number of groups */ gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; /* * Socket credentials. */ struct sockcred { uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; /* * Compute size of a sockcred structure with groups. */ #define SOCKCREDSIZE(ngrps) \ (sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1))) #endif /* __BSD_VISIBLE */ /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ _ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == NULL ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ?
\ (struct cmsghdr *)0 : \ (struct cmsghdr *)(void *)((char *)(cmsg) + \ _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len))) /* * RFC 2292 requires checking msg_controllen, in case the kernel returns * an empty list for some reason. */ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(mhdr)->msg_control : \ (struct cmsghdr *)NULL) #if __BSD_VISIBLE /* RFC 2292 additions */ #define CMSG_SPACE(l) (_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l)) #define CMSG_LEN(l) (_ALIGN(sizeof(struct cmsghdr)) + (l)) #endif #ifdef _KERNEL #define CMSG_ALIGN(n) _ALIGN(n) #endif /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if __BSD_VISIBLE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ #endif #if __BSD_VISIBLE /* * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { unsigned short sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ struct omsghdr { char *msg_name; /* optional address */ int msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ char *msg_accrights; /* access rights sent/received */ int msg_accrightslen; }; #endif /* * howto arguments for shutdown(2), specified by Posix.1g. */ #define SHUT_RD 0 /* shut down the reading side */ #define SHUT_WR 1 /* shut down the writing side */ #define SHUT_RDWR 2 /* shut down both sides */ #if __BSD_VISIBLE /* for SCTP */ /* we cheat and use the SHUT_XX defines for these */ #define PRU_FLUSH_RD SHUT_RD #define PRU_FLUSH_WR SHUT_WR #define PRU_FLUSH_RDWR SHUT_RDWR #endif #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 -#define SF_MNOWAIT 0x00000002 +#define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 +#define SF_NOCACHE 0x00000010 +#define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SFK_COMPAT 0x00000001 +#define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *,
socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; struct tcpcb *so_sototcpcb(struct socket *so); struct inpcb *so_sotoinpcb(struct socket *so); struct sockbuf *so_sockbuf_snd(struct socket *); struct sockbuf *so_sockbuf_rcv(struct socket *); int so_state_get(const struct socket *); void so_state_set(struct socket *, int); int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void so_error_set(struct socket *, int); int so_linger_get(const struct socket *); void so_linger_set(struct socket *, int); struct protosw *so_protosw_get(const struct socket *); void so_protosw_set(struct socket *, struct protosw *); void so_sorwakeup_locked(struct socket *so); void so_sowwakeup_locked(struct socket *so); void so_sorwakeup(struct socket *so); void so_sowwakeup(struct socket *so); void so_lock(struct socket *so); void so_unlock(struct socket *so); void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg); #endif #endif /* !_SYS_SOCKET_H_ */ Index: head/usr.bin/netstat/mbuf.c =================================================================== --- head/usr.bin/netstat/mbuf.c (revision 293438) +++ head/usr.bin/netstat/mbuf.c (revision 293439) @@ -1,339 +1,359 @@ /*- * Copyright (c) 1983, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if 0 #ifndef lint static char sccsid[] = "@(#)mbuf.c 8.1 (Berkeley) 6/6/93"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netstat.h" /* * Print mbuf statistics. */ void mbpr(void *kvmd, u_long mbaddr) { struct memory_type_list *mtlp; struct memory_type *mtp; uintmax_t mbuf_count, mbuf_bytes, mbuf_free, mbuf_failures, mbuf_size; uintmax_t mbuf_sleeps; uintmax_t cluster_count, cluster_limit, cluster_free; uintmax_t cluster_failures, cluster_size, cluster_sleeps; uintmax_t packet_count, packet_bytes, packet_free, packet_failures; uintmax_t packet_sleeps; uintmax_t tag_bytes; uintmax_t jumbop_count, jumbop_limit, jumbop_free; uintmax_t jumbop_failures, jumbop_sleeps, jumbop_size; uintmax_t jumbo9_count, jumbo9_limit, jumbo9_free; uintmax_t jumbo9_failures, jumbo9_sleeps, jumbo9_size; uintmax_t jumbo16_count, jumbo16_limit, jumbo16_free; uintmax_t jumbo16_failures, jumbo16_sleeps, jumbo16_size; uintmax_t bytes_inuse, bytes_incache, bytes_total; int nsfbufs, nsfbufspeak, nsfbufsused; struct sfstat sfstat; size_t mlen; int error; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } /* * Use memstat_*_all() because some mbuf-related memory is in uma(9), * and some malloc(9). */ if (live) { if (memstat_sysctl_all(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_all: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); goto out; } } else { if (memstat_kvm_all(mtlp, kvmd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_all: %s", kvm_geterr(kvmd)); else xo_warnx("memstat_kvm_all: %s", memstat_strerror(error)); goto out; } } mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA, MBUF_MEM_NAME); if (mtp == NULL) { xo_warnx("memstat_mtl_find: zone %s not found", MBUF_MEM_NAME); goto out; } mbuf_count = memstat_get_count(mtp); mbuf_bytes = memstat_get_bytes(mtp); mbuf_free = memstat_get_free(mtp); mbuf_failures = memstat_get_failures(mtp); mbuf_sleeps = memstat_get_sleeps(mtp); mbuf_size = memstat_get_size(mtp); mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA, MBUF_PACKET_MEM_NAME); if (mtp == NULL) { xo_warnx("memstat_mtl_find: zone %s not found", MBUF_PACKET_MEM_NAME); goto out; } packet_count = memstat_get_count(mtp); packet_bytes = memstat_get_bytes(mtp); packet_free = memstat_get_free(mtp); packet_sleeps = memstat_get_sleeps(mtp); packet_failures = memstat_get_failures(mtp); mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA, MBUF_CLUSTER_MEM_NAME); if (mtp == NULL) { xo_warnx("memstat_mtl_find: zone %s not found", MBUF_CLUSTER_MEM_NAME); goto out; } cluster_count = memstat_get_count(mtp); cluster_limit = memstat_get_countlimit(mtp); cluster_free = memstat_get_free(mtp); cluster_failures = memstat_get_failures(mtp); cluster_sleeps = memstat_get_sleeps(mtp); cluster_size = memstat_get_size(mtp); mtp = memstat_mtl_find(mtlp, ALLOCATOR_MALLOC, MBUF_TAG_MEM_NAME); if (mtp == NULL) { xo_warnx("memstat_mtl_find: malloc type %s not found", MBUF_TAG_MEM_NAME); goto out; } tag_bytes = memstat_get_bytes(mtp); mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA, MBUF_JUMBOP_MEM_NAME); if (mtp == NULL) { xo_warnx("memstat_mtl_find: zone %s not found", MBUF_JUMBOP_MEM_NAME); goto out; } jumbop_count = memstat_get_count(mtp); jumbop_limit = memstat_get_countlimit(mtp); jumbop_free = memstat_get_free(mtp); jumbop_failures = memstat_get_failures(mtp); jumbop_sleeps = memstat_get_sleeps(mtp); jumbop_size 
	mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA, MBUF_JUMBO9_MEM_NAME);
	if (mtp == NULL) {
		xo_warnx("memstat_mtl_find: zone %s not found",
		    MBUF_JUMBO9_MEM_NAME);
		goto out;
	}
	jumbo9_count = memstat_get_count(mtp);
	jumbo9_limit = memstat_get_countlimit(mtp);
	jumbo9_free = memstat_get_free(mtp);
	jumbo9_failures = memstat_get_failures(mtp);
	jumbo9_sleeps = memstat_get_sleeps(mtp);
	jumbo9_size = memstat_get_size(mtp);

	mtp = memstat_mtl_find(mtlp, ALLOCATOR_UMA, MBUF_JUMBO16_MEM_NAME);
	if (mtp == NULL) {
		xo_warnx("memstat_mtl_find: zone %s not found",
		    MBUF_JUMBO16_MEM_NAME);
		goto out;
	}
	jumbo16_count = memstat_get_count(mtp);
	jumbo16_limit = memstat_get_countlimit(mtp);
	jumbo16_free = memstat_get_free(mtp);
	jumbo16_failures = memstat_get_failures(mtp);
	jumbo16_sleeps = memstat_get_sleeps(mtp);
	jumbo16_size = memstat_get_size(mtp);

	xo_open_container("mbuf-statistics");

	xo_emit("{:mbuf-current/%ju}/{:mbuf-cache/%ju}/{:mbuf-total/%ju} "
	    "{N:mbufs in use (current\\/cache\\/total)}\n",
	    mbuf_count + packet_count, mbuf_free + packet_free,
	    mbuf_count + packet_count + mbuf_free + packet_free);

	xo_emit("{:cluster-current/%ju}/{:cluster-cache/%ju}/"
	    "{:cluster-total/%ju}/{:cluster-max/%ju} "
	    "{N:mbuf clusters in use (current\\/cache\\/total\\/max)}\n",
	    cluster_count - packet_free, cluster_free + packet_free,
	    cluster_count + cluster_free, cluster_limit);

	xo_emit("{:packet-count/%ju}/{:packet-free/%ju} "
	    "{N:mbuf+clusters out of packet secondary zone in use "
	    "(current\\/cache)}\n",
	    packet_count, packet_free);

	xo_emit("{:jumbo-count/%ju}/{:jumbo-cache/%ju}/{:jumbo-total/%ju}/"
	    "{:jumbo-max/%ju} {:jumbo-page-size/%ju}{U:k} {N:(page size)} "
	    "{N:jumbo clusters in use (current\\/cache\\/total\\/max)}\n",
	    jumbop_count, jumbop_free, jumbop_count + jumbop_free,
	    jumbop_limit, jumbop_size / 1024);

	xo_emit("{:jumbo9-count/%ju}/{:jumbo9-cache/%ju}/"
	    "{:jumbo9-total/%ju}/{:jumbo9-max/%ju} "
	    "{N:9k jumbo clusters in use (current\\/cache\\/total\\/max)}\n",
	    jumbo9_count, jumbo9_free, jumbo9_count + jumbo9_free,
	    jumbo9_limit);

	xo_emit("{:jumbo16-count/%ju}/{:jumbo16-cache/%ju}/"
	    "{:jumbo16-total/%ju}/{:jumbo16-limit/%ju} "
	    "{N:16k jumbo clusters in use (current\\/cache\\/total\\/max)}\n",
	    jumbo16_count, jumbo16_free, jumbo16_count + jumbo16_free,
	    jumbo16_limit);

#if 0
	xo_emit("{:tag-count/%ju} {N:mbuf tags in use}\n", tag_count);
#endif

	/*-
	 * Calculate in-use bytes as:
	 * - straight mbuf memory
	 * - mbuf memory in packets
	 * - the clusters attached to packets
	 * - and the rest of the non-packet-attached clusters.
	 * - m_tag memory
	 * This avoids counting the clusters attached to packets in the cache.
	 * This currently excludes sf_buf space.
	 */
	bytes_inuse =
	    mbuf_bytes +			/* straight mbuf memory */
	    packet_bytes +			/* mbufs in packets */
	    (packet_count * cluster_size) +	/* clusters in packets */
	    /* other clusters */
	    ((cluster_count - packet_count - packet_free) * cluster_size) +
	    tag_bytes +
	    (jumbop_count * jumbop_size) +	/* jumbo clusters */
	    (jumbo9_count * jumbo9_size) +
	    (jumbo16_count * jumbo16_size);
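	/*
	 * Illustration of the double-count guard above: every packet-zone
	 * item carries a cluster.  If 10 clusters exist, 4 inside live
	 * packets and 2 inside cached packets, only 10 - 4 - 2 = 4 count
	 * here as standalone in-use clusters; the 4 in live packets enter
	 * via packet_count, and the 2 in cached packets are charged to
	 * the cache total below rather than to in-use memory.
	 */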
	/*
	 * Calculate in-cache bytes as:
	 * - cached straight mbufs
	 * - cached packet mbufs
	 * - cached packet clusters
	 * - cached straight clusters
	 * This currently excludes sf_buf space.
	 */
	bytes_incache =
	    (mbuf_free * mbuf_size) +		/* straight free mbufs */
	    (packet_free * mbuf_size) +		/* mbufs in free packets */
	    (packet_free * cluster_size) +	/* clusters in free packets */
	    (cluster_free * cluster_size) +	/* free clusters */
	    (jumbop_free * jumbop_size) +	/* jumbo clusters */
	    (jumbo9_free * jumbo9_size) +
	    (jumbo16_free * jumbo16_size);

	/*
	 * Total is bytes in use + bytes in cache.  This doesn't take into
	 * account various other misc data structures, overhead, etc, but
	 * gives the user something useful despite that.
	 */
	bytes_total = bytes_inuse + bytes_incache;

	xo_emit("{:bytes-in-use/%ju}{U:K}/{:bytes-in-cache/%ju}{U:K}/"
	    "{:bytes-total/%ju}{U:K} "
	    "{N:bytes allocated to network (current\\/cache\\/total)}\n",
	    bytes_inuse / 1024, bytes_incache / 1024, bytes_total / 1024);

	xo_emit("{:mbuf-failures/%ju}/{:cluster-failures/%ju}/"
	    "{:packet-failures/%ju} {N:requests for mbufs denied "
	    "(mbufs\\/clusters\\/mbuf+clusters)}\n",
	    mbuf_failures, cluster_failures, packet_failures);
	xo_emit("{:mbuf-sleeps/%ju}/{:cluster-sleeps/%ju}/{:packet-sleeps/%ju} "
	    "{N:requests for mbufs delayed "
	    "(mbufs\\/clusters\\/mbuf+clusters)}\n",
	    mbuf_sleeps, cluster_sleeps, packet_sleeps);
	xo_emit("{:jumbop-sleeps/%ju}/{:jumbo9-sleeps/%ju}/"
	    "{:jumbo16-sleeps/%ju} {N:/requests for jumbo clusters delayed "
	    "(%juk\\/9k\\/16k)}\n",
	    jumbop_sleeps, jumbo9_sleeps, jumbo16_sleeps, jumbop_size / 1024);
	xo_emit("{:jumbop-failures/%ju}/{:jumbo9-failures/%ju}/"
	    "{:jumbo16-failures/%ju} {N:/requests for jumbo clusters denied "
	    "(%juk\\/9k\\/16k)}\n",
	    jumbop_failures, jumbo9_failures, jumbo16_failures,
	    jumbop_size / 1024);

	mlen = sizeof(nsfbufs);
	if (live &&
	    sysctlbyname("kern.ipc.nsfbufs", &nsfbufs, &mlen, NULL, 0) == 0 &&
	    sysctlbyname("kern.ipc.nsfbufsused", &nsfbufsused, &mlen,
	    NULL, 0) == 0 &&
	    sysctlbyname("kern.ipc.nsfbufspeak", &nsfbufspeak, &mlen,
	    NULL, 0) == 0)
		xo_emit("{:nsfbufs-current/%d}/{:nsfbufs-peak/%d}/"
		    "{:nsfbufs/%d} "
		    "{N:sfbufs in use (current\\/peak\\/max)}\n",
		    nsfbufsused, nsfbufspeak, nsfbufs);

	if (fetch_stats("kern.ipc.sfstat", mbaddr, &sfstat, sizeof(sfstat),
	    kread_counters) != 0)
		goto out;
+	xo_emit("{:sendfile-syscalls/%ju} {N:sendfile syscalls}\n",
+	    (uintmax_t)sfstat.sf_syscalls);
+	xo_emit("{:sendfile-no-io/%ju} "
+	    "{N:sendfile syscalls completed without I\\/O request}\n",
+	    (uintmax_t)sfstat.sf_noiocnt);
+	xo_emit("{:sendfile-io-count/%ju} "
+	    "{N:requests for I\\/O initiated by sendfile}\n",
+	    (uintmax_t)sfstat.sf_iocnt);
+	xo_emit("{:sendfile-pages-sent/%ju} "
+	    "{N:pages read by sendfile as part of a request}\n",
+	    (uintmax_t)sfstat.sf_pages_read);
+	xo_emit("{:sendfile-pages-valid/%ju} "
+	    "{N:pages were valid at time of a sendfile request}\n",
+	    (uintmax_t)sfstat.sf_pages_valid);
+	xo_emit("{:sendfile-requested-readahead/%ju} "
+	    "{N:pages were requested for read ahead by applications}\n",
+	    (uintmax_t)sfstat.sf_rhpages_requested);
+	xo_emit("{:sendfile-readahead/%ju} "
+	    "{N:pages were read ahead by sendfile}\n",
+	    (uintmax_t)sfstat.sf_rhpages_read);
+	xo_emit("{:sendfile-busy-encounters/%ju} "
+	    "{N:times sendfile encountered an already busy page}\n",
+	    (uintmax_t)sfstat.sf_busy);
	xo_emit("{:sfbufs-alloc-failed/%ju} {N:requests for sfbufs denied}\n",
	    (uintmax_t)sfstat.sf_allocfail);
	xo_emit("{:sfbufs-alloc-wait/%ju} {N:requests for sfbufs delayed}\n",
	    (uintmax_t)sfstat.sf_allocwait);
-	xo_emit("{:sfbufs-io-count/%ju} "
-	    "{N:requests for I\\/O initiated by sendfile}\n",
-	    (uintmax_t)sfstat.sf_iocnt);
out:
	xo_close_container("mbuf-statistics");
	memstat_mtl_free(mtlp);
}
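For quick checks outside netstat(1), the counters printed by the hunk above
can also be read directly from the live kern.ipc.sfstat sysctl that
fetch_stats() consults. What follows is a minimal sketch, assuming that on a
running system this sysctl exports a struct sfstat (declared via
<sys/sf_buf.h>, as in the headers above); it is not part of the commit.

/*
 * Sketch: fetch sendfile(2) statistics via sysctl(3) and print a few
 * of the fields the new netstat output reports.
 */
#include <sys/types.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct sfstat sf;
	size_t len = sizeof(sf);

	/* Live systems only; kvm-based post-mortem needs kread_counters(). */
	if (sysctlbyname("kern.ipc.sfstat", &sf, &len, NULL, 0) == -1) {
		perror("sysctlbyname(kern.ipc.sfstat)");
		return (1);
	}
	printf("%ju sendfile syscalls\n", (uintmax_t)sf.sf_syscalls);
	printf("%ju pages read, %ju already valid\n",
	    (uintmax_t)sf.sf_pages_read, (uintmax_t)sf.sf_pages_valid);
	printf("%ju busy pages encountered\n", (uintmax_t)sf.sf_busy);
	return (0);
}

A steadily growing sf_busy relative to sf_syscalls suggests contention on
file pages, the situation SF_NODISKIO lets an application detect via EBUSY.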