diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c index f5e8abff85c1..123553bc53dd 100644 --- a/sys/kern/subr_mbuf.c +++ b/sys/kern/subr_mbuf.c @@ -1,1595 +1,1594 @@ /*- * Copyright (c) 2001, 2002, 2003 * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * mb_alloc: network buffer allocator */ /* * Maximum number of PCPU containers. If you know what you're doing you could * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your * system during compilation, and thus prevent kernel structure bloat. * * SMP and non-SMP kernels clearly have a different number of possible CPUs, * but because we cannot assume a dense array of CPUs, we always allocate * and traverse PCPU containers up to NCPU amount and merely check for * CPU availability. */ #ifdef MBALLOC_NCPU #define NCPU MBALLOC_NCPU #else #define NCPU MAXCPU #endif /*- * The mbuf allocator is based on Alfred Perlstein's * "memcache" proof-of-concept allocator which was itself based on * several well-known SMP-friendly allocators. * * The mb_alloc mbuf allocator is a special when compared to other * general-purpose allocators. Some things to take note of: * * Mbufs and mbuf clusters are two different objects. Sometimes we * will allocate a single mbuf, other times a single cluster, * other times both. Further, we may sometimes wish to allocate a * whole chain of mbufs with clusters. This allocator will perform * the common case of each scenario in one function call (this * includes constructing or destructing the object) while only * locking/unlocking the cache once, if it can get away with it. * The caches consist of pure mbufs and pure clusters; that is * there are no 'zones' containing mbufs with already pre-hooked * clusters. Since we can allocate both objects atomically anyway, * we don't bother fragmenting our caches for any particular 'scenarios.' 
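* (For example, m_getcl() below allocates an mbuf and a cluster in a single call while taking the per-CPU cache lock only once, by way of the MBP_PERSIST/MBP_PERSISTENT flags.)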
* * We allocate from seperate sub-maps of kmem_map, thus imposing * an ultimate upper-limit on the number of allocatable clusters * and mbufs and also, since the clusters all come from a * virtually contiguous region, we can keep reference counters * for them and "allocate" them purely by indexing into a * dense refcount vector. * * We call out to protocol drain routines (which can be hooked * into us) when we're low on space. * * The mbuf allocator keeps all objects that it allocates in mb_buckets. * The buckets keep a number of objects (an object can be an mbuf or an * mbuf cluster) and facilitate moving larger sets of contiguous objects * from the per-CPU caches to the global cache. The buckets also have * the added advantage that objects, when migrated from cache to cache, * are migrated in chunks that keep contiguous objects together, * minimizing TLB pollution. * * The buckets are kept on singly-linked lists called "containers." A container * is protected by a mutex in order to ensure consistency. The mutex * itself is allocated separately and attached to the container at boot time, * thus allowing for certain containers to share the same lock. Per-CPU * containers for mbufs and mbuf clusters all share the same per-CPU * lock whereas the global cache containers for these objects share one * global lock. */ struct mb_bucket { SLIST_ENTRY(mb_bucket) mb_blist; int mb_owner; int mb_numfree; void *mb_free[0]; }; struct mb_container { SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; struct mtx *mc_lock; int mc_numowner; u_int mc_starved; long *mc_types; u_long *mc_objcount; u_long *mc_numbucks; }; struct mb_gen_list { struct mb_container mb_cont; struct cv mgl_mstarved; }; struct mb_pcpu_list { struct mb_container mb_cont; }; /* * Boot-time configurable object counts that will determine the maximum * number of permitted objects in the mbuf and mcluster cases. In the * ext counter (nmbcnt) case, it's just an indicator serving to scale * kmem_map size properly - in other words, we may be allowed to allocate * more than nmbcnt counters, whereas we will never be allowed to allocate * more than nmbufs mbufs or nmbclusters mclusters. * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be * allocatable by the sfbuf allocator (found in uipc_syscalls.c) */ #ifndef NMBCLUSTERS #define NMBCLUSTERS (1024 + maxusers * 64) #endif #ifndef NMBUFS #define NMBUFS (nmbclusters * 2) #endif #ifndef NSFBUFS #define NSFBUFS (512 + maxusers * 16) #endif #ifndef NMBCNTS #define NMBCNTS (nmbclusters + nsfbufs) #endif int nmbufs; int nmbclusters; int nmbcnt; int nsfbufs; /* * Sizes of objects per bucket. There are this size's worth of mbufs * or clusters in each bucket. Please keep these a power-of-2. */ #define MBUF_BUCK_SZ (PAGE_SIZE * 2) #define CLUST_BUCK_SZ (PAGE_SIZE * 4) /* * Perform sanity checks of tunables declared above. */ static void tunable_mbinit(void *dummy) { /* * This has to be done before VM init. */ nmbclusters = NMBCLUSTERS; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); nmbufs = NMBUFS; TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); nsfbufs = NSFBUFS; TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); nmbcnt = NMBCNTS; TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); /* Sanity checks */ if (nmbufs < nmbclusters * 2) nmbufs = nmbclusters * 2; if (nmbcnt < nmbclusters + nsfbufs) nmbcnt = nmbclusters + nsfbufs; } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); /* * The freelist structures and mutex locks. 
The number statically declared * here depends on the number of CPUs. * * We set up in such a way that all the objects (mbufs, clusters) * share the same mutex lock. It has been established that we do not benefit * from different locks for different objects, so we use the same lock, * regardless of object type. This also allows us to do optimised * multi-object allocations without dropping the lock in between. */ struct mb_lstmngr { struct mb_gen_list *ml_genlist; struct mb_pcpu_list *ml_cntlst[NCPU]; struct mb_bucket **ml_btable; vm_map_t ml_map; vm_offset_t ml_mapbase; vm_offset_t ml_maptop; int ml_mapfull; u_int ml_objsize; u_int ml_objbucks; u_int *ml_wmhigh; u_int *ml_wmlow; }; static struct mb_lstmngr mb_list_mbuf, mb_list_clust; static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; u_int *cl_refcntmap; /* * Local macros for internal allocator structure manipulations. */ #ifdef SMP #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] #else #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] #endif #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) #define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ (mb_lst)->ml_cntlst[(num)] #define MB_BUCKET_INDX(mb_obj, mb_lst) \ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \ ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize)) #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ { \ struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ \ (mb_bckt)->mb_numfree--; \ (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ (*((mb_lst)->mb_cont.mc_objcount))--; \ if ((mb_bckt)->mb_numfree == 0) { \ SLIST_REMOVE_HEAD(_mchd, mb_blist); \ SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ } \ } #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ (mb_bckt)->mb_numfree++; \ (*((mb_lst)->mb_cont.mc_objcount))++; #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ if ((mb_type) != MT_NOTMBUF) \ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ if ((mb_type) != MT_NOTMBUF) \ (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) /* * Ownership of buckets/containers is represented by integers. The PCPU * lists range from 0 to NCPU-1. We need a free numerical id for the general * list (we use NCPU). We also need a non-conflicting free bit to indicate * that the bucket is free and removed from a container, while not losing * the bucket's originating container id. We use the highest bit * for the free marker. */ #define MB_GENLIST_OWNER (NCPU) #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) /* Statistics structures for allocator (per-CPU and general). */ static struct mbpstat mb_statpcpu[NCPU + 1]; struct mbstat mbstat; /* Sleep time for wait code (in ticks). */ static int mbuf_wait = 64; static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */ static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */ static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */ static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */ /* * Objects exported by sysctl(8). 
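* The nmb* limits below are boot-time tunables (see tunable_mbinit() above) and are exported read-only, while mbuf_wait and the per-cache watermarks may be adjusted at run time.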
*/ SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "Maximum number of mbuf clusters available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, "Maximum number of mbufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0, "Number used to scale kmem_map to ensure sufficient space for counters"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, "Sleep time of mbuf subsystem wait allocations during exhaustion"); SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0, "Upper limit of number of mbufs allowed in each cache"); SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RW, &mbuf_lowm, 0, "Lower limit of number of mbufs allowed in each cache"); SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0, "Upper limit of number of mbuf clusters allowed in each cache"); SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RW, &clust_lowm, 0, "Lower limit of number of mbuf clusters allowed in each cache"); SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, "Mbuf general information and statistics"); SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); /* * Prototypes of local allocator routines. */ static void *mb_alloc_wait(struct mb_lstmngr *, short); static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, struct mb_pcpu_list *); static void mb_reclaim(void); static void mbuf_init(void *); /* * Initial allocation numbers. Each parameter represents the number of buckets * of each object that will be placed initially in each PCPU container for * said object. */ #define NMB_MBUF_INIT 2 #define NMB_CLUST_INIT 8 /* * Internal flags that allow for cache locks to remain "persistent" across * allocation and free calls. They may be used in combination. */ #define MBP_PERSIST 0x1 /* Return with lock still held. */ #define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */ /* * Initialize the mbuf subsystem. * * We sub-divide the kmem_map into several submaps; this way, we don't have * to worry about artificially limiting the number of mbuf or mbuf cluster * allocations, due to fear of one type of allocation "stealing" address * space initially reserved for another. * * Set up both the general containers and all the PCPU containers. Populate * the PCPU containers with initial numbers. */ MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) static void mbuf_init(void *dummy) { struct mb_pcpu_list *pcpu_cnt; vm_size_t mb_map_size; int i, j; /* * Set up all the submaps, for each type of object that we deal * with in this allocator. 
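* Each submap is sized from its tunable (nmbufs or nmbclusters) and rounded down to a whole number of buckets, which is what lets MB_BUCKET_INDX() map any object address back to its owning bucket.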
*/ mb_map_size = (vm_size_t)(nmbufs * MSIZE); mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ); mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); if (mb_list_mbuf.ml_btable == NULL) goto bad; mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), &(mb_list_mbuf.ml_maptop), mb_map_size); mb_list_mbuf.ml_map->system_map = 1; mb_list_mbuf.ml_mapfull = 0; mb_list_mbuf.ml_objsize = MSIZE; mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / MSIZE; mb_list_mbuf.ml_wmhigh = &mbuf_hiwm; mb_list_mbuf.ml_wmlow = &mbuf_lowm; mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ); mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); if (mb_list_clust.ml_btable == NULL) goto bad; mb_list_clust.ml_map = kmem_suballoc(kmem_map, &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), mb_map_size); mb_list_clust.ml_map->system_map = 1; mb_list_clust.ml_mapfull = 0; mb_list_clust.ml_objsize = MCLBYTES; mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / MCLBYTES; mb_list_clust.ml_wmhigh = &clust_hiwm; mb_list_clust.ml_wmlow = &clust_lowm; /* * Allocate required general (global) containers for each object type. */ mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, M_NOWAIT); mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, M_NOWAIT); if ((mb_list_mbuf.ml_genlist == NULL) || (mb_list_clust.ml_genlist == NULL)) goto bad; /* * Initialize condition variables and general container mutex locks. */ mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0); cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), "mcluster pool starved"); mb_list_mbuf.ml_genlist->mb_cont.mc_lock = mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; /* * Set up the general containers for each object. */ mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; mb_list_mbuf.ml_genlist->mb_cont.mc_starved = mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); mb_list_clust.ml_genlist->mb_cont.mc_objcount = &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks); mb_list_clust.ml_genlist->mb_cont.mc_numbucks = &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks); mb_list_mbuf.ml_genlist->mb_cont.mc_types = &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); /* * Allocate all the required counters for clusters. This makes * cluster allocations/deallocations much faster. */ cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT); if (cl_refcntmap == NULL) goto bad; /* * Initialize general mbuf statistics. */ mbstat.m_msize = MSIZE; mbstat.m_mclbytes = MCLBYTES; mbstat.m_minclsize = MINCLSIZE; mbstat.m_mlen = MLEN; mbstat.m_mhlen = MHLEN; mbstat.m_numtypes = MT_NTYPES; mbstat.m_mbperbuck = MBUF_BUCK_SZ / MSIZE; mbstat.m_clperbuck = CLUST_BUCK_SZ / MCLBYTES; /* * Allocate and initialize PCPU containers. 
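* Absent CPUs are merely marked inactive in the statistics; each present CPU gets a pair of containers (mbuf and cluster) sharing a single per-CPU lock and is primed with NMB_MBUF_INIT mbuf buckets and NMB_CLUST_INIT cluster buckets.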
*/ for (i = 0; i < NCPU; i++) { if (CPU_ABSENT(i)) { mb_statpcpu[i].mb_active = 0; continue; } mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), M_MBUF, M_NOWAIT); mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), M_MBUF, M_NOWAIT); if ((mb_list_mbuf.ml_cntlst[i] == NULL) || (mb_list_clust.ml_cntlst[i] == NULL)) goto bad; mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; mb_statpcpu[i].mb_active = 1; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = &(mb_statpcpu[i].mb_mbfree); mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = &(mb_statpcpu[i].mb_clfree); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks = &(mb_statpcpu[i].mb_mbbucks); mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks = &(mb_statpcpu[i].mb_clbucks); mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = &(mb_statpcpu[i].mb_mbtypes[0]); mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); /* * Perform initial allocations. */ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); MB_LOCK_CONT(pcpu_cnt); for (j = 0; j < NMB_MBUF_INIT; j++) { if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) == NULL) goto bad; } MB_UNLOCK_CONT(pcpu_cnt); pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); MB_LOCK_CONT(pcpu_cnt); for (j = 0; j < NMB_CLUST_INIT; j++) { if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) == NULL) goto bad; } MB_UNLOCK_CONT(pcpu_cnt); } return; bad: panic("mbuf_init(): failed to initialize mbuf subsystem!"); } /* * Populate a given mbuf PCPU container with a bucket full of fresh new * buffers. Return a pointer to the new bucket (already in the container if * successful), or return NULL on failure. * * LOCKING NOTES: * PCPU container lock must be held when this is called. * The lock is dropped here so that we can cleanly call the underlying VM * code. If we fail, we return with no locks held. If we succeed (i.e., return * non-NULL), we return with the PCPU lock held, ready for allocation from * the returned bucket. */ static struct mb_bucket * mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) { struct mb_bucket *bucket; caddr_t p; int i; MB_UNLOCK_CONT(cnt_lst); /* * If our object's (finite) map is starved now (i.e., no more address * space), bail out now. */ if (mb_list->ml_mapfull) return (NULL); bucket = malloc(sizeof(struct mb_bucket) + - mb_list->ml_objbucks * sizeof(void *), M_MBUF, - how == M_TRYWAIT ? M_WAITOK : M_NOWAIT); + mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how)); if (bucket == NULL) return (NULL); p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize * - mb_list->ml_objbucks, how == M_TRYWAIT ? 
M_WAITOK : M_NOWAIT); + mb_list->ml_objbucks, MBTOM(how)); if (p == NULL) { free(bucket, M_MBUF); if (how == M_TRYWAIT) mb_list->ml_mapfull = 1; return (NULL); } bucket->mb_numfree = 0; mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; for (i = 0; i < mb_list->ml_objbucks; i++) { bucket->mb_free[i] = p; bucket->mb_numfree++; p += mb_list->ml_objsize; } MB_LOCK_CONT(cnt_lst); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); (*(cnt_lst->mb_cont.mc_numbucks))++; *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; return (bucket); } /* * Allocate a network buffer. * The general case is very easy. Complications only arise if our PCPU * container is empty. Things get worse if the PCPU container is empty, * the general container is empty, and we've run out of address space * in our map; then we try to block if we're willing to (M_TRYWAIT). */ static __inline void * mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist, int *pers_list) { static int last_report; struct mb_pcpu_list *cnt_lst; struct mb_bucket *bucket; void *m; #ifdef INVARIANTS int flags; flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT); if (flags != M_DONTWAIT && flags != M_TRYWAIT) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, 1)) { printf("Bad mbuf alloc flags: %x\n", flags); backtrace(); how = M_TRYWAIT; } } #endif m = NULL; if ((persist & MBP_PERSISTENT) != 0) { /* * If we're a "persistent" call, then the per-CPU #(pers_list) * cache lock is already held, and we just need to refer to * the correct cache descriptor. */ cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); } else { cnt_lst = MB_GET_PCPU_LIST(mb_list); MB_LOCK_CONT(cnt_lst); } if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { /* * This is the easy allocation case. We just grab an object * from a bucket in the PCPU container. At worst, we * have just emptied the bucket and so we remove it * from the container. */ MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = cnt_lst->mb_cont.mc_numowner; } else { struct mb_gen_list *gen_list; /* * This is the less-common more difficult case. We must * first verify if the general list has anything for us * and if that also fails, we must allocate a page from * the map and create a new bucket to place in our PCPU * container (already locked). If the map is starved then * we're really in for trouble, as we have to wait on * the general container's condition variable. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { /* * Give ownership of the bucket to our CPU's * container, but only actually put the bucket * in the container if it doesn't become free * upon removing an mbuf from it. 
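* (A bucket emptied by this allocation is instead tagged MB_BUCKET_FREE and left unlinked; mb_free() re-inserts it once objects are returned to it.)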
*/ SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), mb_blist); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; (*(gen_list->mb_cont.mc_numbucks))--; (*(cnt_lst->mb_cont.mc_numbucks))++; *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; bucket->mb_numfree--; m = bucket->mb_free[(bucket->mb_numfree)]; if (bucket->mb_numfree == 0) { SLIST_NEXT(bucket, mb_blist) = NULL; bucket->mb_owner |= MB_BUCKET_FREE; } else { SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; } MB_UNLOCK_CONT(gen_list); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = cnt_lst->mb_cont.mc_numowner; } else { /* * We'll have to allocate a new page. */ MB_UNLOCK_CONT(gen_list); bucket = mb_pop_cont(mb_list, how, cnt_lst); if (bucket != NULL) { MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); /* If asked to persist, do not drop the lock. */ if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list=cnt_lst->mb_cont.mc_numowner; } else { if (how == M_TRYWAIT) { /* * Absolute worst-case scenario. * We block if we're willing to, but * only after trying to steal from * other lists. */ m = mb_alloc_wait(mb_list, type); } else { /* XXX: No consistency. */ mbstat.m_drops++; if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf( "All mbufs or mbuf clusters exhausted, please see tuning(7).\n"); } } if (m != NULL && (persist & MBP_PERSIST) != 0) { cnt_lst = MB_GET_PCPU_LIST(mb_list); MB_LOCK_CONT(cnt_lst); *pers_list=cnt_lst->mb_cont.mc_numowner; } } } } return (m); } /* * This is the worst-case scenario called only if we're allocating with * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf * by looking in every PCPU container. If we're still unsuccesful, we * try the general container one last time and possibly block on our * starved cv. */ static void * mb_alloc_wait(struct mb_lstmngr *mb_list, short type) { struct mb_pcpu_list *cnt_lst; struct mb_gen_list *gen_list; struct mb_bucket *bucket; void *m; int i, cv_ret; /* * Try to reclaim mbuf-related objects (mbufs, clusters). */ mb_reclaim(); /* * Cycle all the PCPU containers. Increment starved counts if found * empty. */ for (i = 0; i < NCPU; i++) { if (CPU_ABSENT(i)) continue; cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); MB_LOCK_CONT(cnt_lst); /* * If container is non-empty, get a single object from it. * If empty, increment starved count. */ if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { MB_GET_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_INC(cnt_lst, type, 1); MB_UNLOCK_CONT(cnt_lst); mbstat.m_wait++; /* XXX: No consistency. */ return (m); } else cnt_lst->mb_cont.mc_starved++; MB_UNLOCK_CONT(cnt_lst); } /* * We're still here, so that means it's time to get the general * container lock, check it one more time (now that mb_reclaim() * has been called) and if we still get nothing, block on the cv. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { MB_GET_OBJECT(m, bucket, gen_list); MB_MBTYPES_INC(gen_list, type, 1); MB_UNLOCK_CONT(gen_list); mbstat.m_wait++; /* XXX: No consistency. 
*/ return (m); } gen_list->mb_cont.mc_starved++; cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), gen_list->mb_cont.mc_lock, mbuf_wait); gen_list->mb_cont.mc_starved--; if ((cv_ret == 0) && ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { MB_GET_OBJECT(m, bucket, gen_list); MB_MBTYPES_INC(gen_list, type, 1); mbstat.m_wait++; /* XXX: No consistency. */ } else { mbstat.m_drops++; /* XXX: No consistency. */ m = NULL; } MB_UNLOCK_CONT(gen_list); return (m); } /*- * Free an object to its rightful container. * In the very general case, this operation is really very easy. * Complications arise primarily if: * (a) We've hit the high limit on number of free objects allowed in * our PCPU container. * (b) We're in a critical situation where our container has been * marked 'starved' and we need to issue wakeups on the starved * condition variable. * (c) Minor (odd) cases: our bucket has migrated while we were * waiting for the lock; our bucket is in the general container; * our bucket is empty. */ static __inline void mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist, int *pers_list) { struct mb_pcpu_list *cnt_lst; struct mb_gen_list *gen_list; struct mb_bucket *bucket; u_int owner; bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; /* * Make sure that if after we lock the bucket's present container the * bucket has migrated, that we drop the lock and get the new one. */ retry_lock: owner = bucket->mb_owner & ~MB_BUCKET_FREE; switch (owner) { case MB_GENLIST_OWNER: gen_list = MB_GET_GEN_LIST(mb_list); if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { if (*pers_list != MB_GENLIST_OWNER) { cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); MB_UNLOCK_CONT(cnt_lst); MB_LOCK_CONT(gen_list); } } else { MB_LOCK_CONT(gen_list); } if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { MB_UNLOCK_CONT(gen_list); *pers_list = -1; goto retry_lock; } /* * If we're intended for the general container, this is * real easy: no migrating required. The only `bogon' * is that we're now contending with all the threads * dealing with the general list, but this is expected. */ MB_PUT_OBJECT(m, bucket, gen_list); MB_MBTYPES_DEC(gen_list, type, 1); if (gen_list->mb_cont.mc_starved > 0) cv_signal(&(gen_list->mgl_mstarved)); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(gen_list); else *pers_list = MB_GENLIST_OWNER; break; default: cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { if (*pers_list == MB_GENLIST_OWNER) { gen_list = MB_GET_GEN_LIST(mb_list); MB_UNLOCK_CONT(gen_list); MB_LOCK_CONT(cnt_lst); } else { cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); owner = *pers_list; } } else { MB_LOCK_CONT(cnt_lst); } if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { MB_UNLOCK_CONT(cnt_lst); *pers_list = -1; goto retry_lock; } MB_PUT_OBJECT(m, bucket, cnt_lst); MB_MBTYPES_DEC(cnt_lst, type, 1); if (cnt_lst->mb_cont.mc_starved > 0) { /* * This is a tough case. It means that we've * been flagged at least once to indicate that * we're empty, and that the system is in a critical * situation, so we ought to migrate at least one * bucket over to the general container. * There may or may not be a thread blocking on * the starved condition variable, but chances * are that one will eventually come up soon so * it's better to migrate now than never. 
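* The bucket handed over here is expected to be one that was emptied earlier (and so is still tagged MB_BUCKET_FREE) and has just received its first object back; the KASSERT below enforces this.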
*/ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0, ("mb_free: corrupt bucket %p\n", bucket)); SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = MB_GENLIST_OWNER; (*(cnt_lst->mb_cont.mc_objcount))--; (*(gen_list->mb_cont.mc_objcount))++; (*(cnt_lst->mb_cont.mc_numbucks))--; (*(gen_list->mb_cont.mc_numbucks))++; /* * Determine whether or not to keep transferring * buckets to the general list or whether we've * transferred enough already. * We realize that although we may flag another * bucket to be migrated to the general container * that in the meantime, the thread that was * blocked on the cv is already woken up and * long gone. But in that case, the worst * consequence is that we will end up migrating * one bucket too many, which is really not a big * deal, especially if we're close to a critical * situation. */ if (gen_list->mb_cont.mc_starved > 0) { cnt_lst->mb_cont.mc_starved--; cv_signal(&(gen_list->mgl_mstarved)); } else cnt_lst->mb_cont.mc_starved = 0; MB_UNLOCK_CONT(gen_list); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) { /* * We've hit the high limit of allowed numbers of mbufs * on this PCPU list. We must now migrate a bucket * over to the general container. */ gen_list = MB_GET_GEN_LIST(mb_list); MB_LOCK_CONT(gen_list); if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), mb_blist); } SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = MB_GENLIST_OWNER; *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; (*(cnt_lst->mb_cont.mc_numbucks))--; (*(gen_list->mb_cont.mc_numbucks))++; /* * While we're at it, transfer some of the mbtypes * "count load" onto the general list's mbtypes * array, seeing as how we're moving the bucket * there now, meaning that the freeing of objects * there will now decrement the _general list's_ * mbtypes counters, and no longer our PCPU list's * mbtypes counters. We do this for the type presently * being freed in an effort to keep the mbtypes * counters approximately balanced across all lists. */ MB_MBTYPES_DEC(cnt_lst, type, mb_list->ml_objbucks - bucket->mb_numfree); MB_MBTYPES_INC(gen_list, type, mb_list->ml_objbucks - bucket->mb_numfree); MB_UNLOCK_CONT(gen_list); if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } if (bucket->mb_owner & MB_BUCKET_FREE) { SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; } if ((persist & MBP_PERSIST) == 0) MB_UNLOCK_CONT(cnt_lst); else *pers_list = owner; break; } } /* * Drain protocols in hopes to free up some resources. * * LOCKING NOTES: * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * violation if we're holding any mutex if that mutex is acquired in reverse * order relative to one of the locks in the drain routines. */ static void mb_reclaim(void) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, "mb_reclaim()"); mbstat.m_drain++; /* XXX: No consistency. 
*/ for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /****************************************************************************** * Internal setup macros. */ #define _mb_setup(m, type) do { \ (m)->m_type = (type); \ (m)->m_next = NULL; \ (m)->m_nextpkt = NULL; \ (m)->m_data = (m)->m_dat; \ (m)->m_flags = 0; \ } while (0) #define _mbhdr_setup(m, type) do { \ (m)->m_type = (type); \ (m)->m_next = NULL; \ (m)->m_nextpkt = NULL; \ (m)->m_data = (m)->m_pktdat; \ (m)->m_flags = M_PKTHDR; \ (m)->m_pkthdr.rcvif = NULL; \ (m)->m_pkthdr.csum_flags = 0; \ SLIST_INIT(&(m)->m_pkthdr.tags); \ } while (0) #define _mcl_setup(m) do { \ (m)->m_data = (m)->m_ext.ext_buf; \ (m)->m_flags |= M_EXT; \ (m)->m_ext.ext_free = NULL; \ (m)->m_ext.ext_args = NULL; \ (m)->m_ext.ext_size = MCLBYTES; \ (m)->m_ext.ext_type = EXT_CLUSTER; \ } while (0) #define _mext_init_ref(m, ref) do { \ (m)->m_ext.ref_cnt = ((ref) == NULL) ? \ malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \ if ((m)->m_ext.ref_cnt != NULL) { \ *((m)->m_ext.ref_cnt) = 0; \ MEXT_ADD_REF((m)); \ } \ } while (0) #define cl2ref(cl) \ (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT) #define _mext_dealloc_ref(m) \ if ((m)->m_ext.ext_type != EXT_EXTREF) \ free((m)->m_ext.ref_cnt, M_MBUF) /****************************************************************************** * Internal routines. * * Because mb_alloc() and mb_free() are inlines (to keep the common * cases down to a maximum of one function call), below are a few * routines used only internally for the sole purpose of making certain * functions smaller. * * - _mext_free(): frees associated storage when the ref. count is * exactly one and we're freeing. * * - _mgetm_internal(): common "persistent-lock" routine that allocates * an mbuf and a cluster in one shot, but where the lock is already * held coming in (which is what makes it different from the exported * m_getcl()). The lock is dropped when done. This is used by m_getm() * and, therefore, is very m_getm()-specific. */ static struct mbuf *_mgetm_internal(int, short, short, int); void _mext_free(struct mbuf *mb) { if (mb->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, 0, NULL); } else { (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); _mext_dealloc_ref(mb); } } static struct mbuf * _mgetm_internal(int how, short type, short persist, int cchnum) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum); if (mb == NULL) return NULL; _mb_setup(mb, type); if ((persist & MBP_PERSIST) != 0) { mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); mb = NULL; } _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } return (mb); } /****************************************************************************** * Exported buffer allocation and de-allocation routines. */ /* * Allocate and return a single (normal) mbuf. NULL is returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. 
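* * A minimal usage sketch (hypothetical caller, not part of this change): m = m_get(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS);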
*/ struct mbuf * m_get(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) _mb_setup(mb, type); return (mb); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one but still return the top of the newly allocated * chain. NULL is returned on failure, in which case the [optional] * provided chain is left untouched, and any memory already allocated * is freed. * * Arguments: * - m: existing chain to which to append new chain (optional). * - len: total length of data to append, either in mbufs or clusters * (we allocate whatever combination yields the best fit). * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_getm(struct mbuf *m, int len, int how, short type) { struct mbuf *mb, *top, *cur, *mtail; int num, rem, cchnum; short persist; int i; KASSERT(len >= 0, ("m_getm(): len is < 0")); /* If m != NULL, we will append to the end of that chain. */ if (m != NULL) for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); else mtail = NULL; /* * In the best-case scenario (which should be the common case * unless we're in a starvation situation), we will be able to * go through the allocation of all the desired mbufs and clusters * here without dropping our per-CPU cache lock in between. */ num = len / MCLBYTES; rem = len % MCLBYTES; persist = 0; cchnum = -1; top = cur = NULL; for (i = 0; i < num; i++) { mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST | persist, &cchnum); if (mb == NULL) goto failed; _mb_setup(mb, type); mb->m_len = 0; persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0; mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); goto failed; } _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); persist = MBP_PERSISTENT; if (cur == NULL) top = cur = mb; else cur = (cur->m_next = mb); } if (rem > 0) { if (cchnum >= 0) { persist = MBP_PERSISTENT; persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0; mb = _mgetm_internal(how, type, persist, cchnum); if (mb == NULL) goto failed; } else if (rem > MINCLSIZE) { mb = m_getcl(how, type, 0); } else { mb = m_get(how, type); } if (mb != NULL) { mb->m_len = 0; if (cur == NULL) top = mb; else cur->m_next = mb; } else goto failed; } if (mtail != NULL) mtail->m_next = top; return top; failed: if (top != NULL) m_freem(top); return NULL; } /* * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_gethdr(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mbhdr_setup(mb, type); #ifdef MAC if (mac_init_mbuf(mb, how) != 0) { m_free(mb); return NULL; } #endif } return (mb); } /* * Allocate and return a single (normal) pre-zero'd mbuf. NULL is * returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. 
* - type: the type of the mbuf being allocated. */ struct mbuf * m_get_clrd(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mb_setup(mb, type); bzero(mtod(mb, caddr_t), MLEN); } return (mb); } /* * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is * returned on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. * - type: the type of the mbuf being allocated. */ struct mbuf * m_gethdr_clrd(int how, short type) { struct mbuf *mb; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); if (mb != NULL) { _mbhdr_setup(mb, type); #ifdef MAC if (mac_init_mbuf(mb, how) != 0) { m_free(mb); return NULL; } #endif bzero(mtod(mb, caddr_t), MHLEN); } return (mb); } /* * Free a single mbuf and any associated storage that it may have attached * to it. The associated storage may not be immediately freed if its * reference count is above 1. Returns the next mbuf in the chain following * the mbuf being freed. * * Arguments: * - mb: the mbuf to free. */ struct mbuf * m_free(struct mbuf *mb) { struct mbuf *nb; int cchnum; short persist = 0; if ((mb->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(mb, NULL); #ifdef MAC if ((mb->m_flags & M_PKTHDR) && (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED)) mac_destroy_mbuf(mb); #endif nb = mb->m_next; if ((mb->m_flags & M_EXT) != 0) { MEXT_REM_REF(mb); if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) { if (mb->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, MBP_PERSIST, &cchnum); persist = MBP_PERSISTENT; } else { (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); _mext_dealloc_ref(mb); persist = 0; } } } mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum); return (nb); } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. Right now, we only optimize a little so that the cache * lock may be held across a single mbuf+cluster free. Hopefully, * we'll eventually be holding the lock across more than merely two * consecutive frees but right now this is hard to implement because of * things like _mext_dealloc_ref (may do a free()) and atomic ops in the * loop. * * - mb: the mbuf chain to free. */ void m_freem(struct mbuf *mb) { struct mbuf *m; int cchnum; short persist; while (mb != NULL) { if ((mb->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(mb, NULL); #ifdef MAC if ((mb->m_flags & M_PKTHDR) && (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED)) mac_destroy_mbuf(mb); #endif persist = 0; m = mb; mb = mb->m_next; if ((m->m_flags & M_EXT) != 0) { MEXT_REM_REF(m); if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) { if (m->m_ext.ext_type == EXT_CLUSTER) { mb_free(&mb_list_clust, (caddr_t)m->m_ext.ext_buf, MT_NOTMBUF, MBP_PERSIST, &cchnum); persist = MBP_PERSISTENT; } else { (*(m->m_ext.ext_free))(m->m_ext.ext_buf, m->m_ext.ext_args); _mext_dealloc_ref(m); persist = 0; } } } mb_free(&mb_list_mbuf, m, m->m_type, persist, &cchnum); } } /* * Fetch an mbuf with a cluster attached to it. If one of the * allocations fails, the entire allocation fails. This routine is * the preferred way of fetching both the mbuf and cluster together, * as it avoids having to unlock/relock between allocations. Returns * NULL on failure. * * Arguments: * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. 
* - type: the type of the mbuf being allocated. * - flags: any flags to pass to the mbuf being allocated; if this includes * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf. */ struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *mb; int cchnum; mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST, &cchnum); if (mb == NULL) return NULL; mb->m_type = type; mb->m_next = NULL; mb->m_flags = flags; if ((flags & M_PKTHDR) != 0) { mb->m_nextpkt = NULL; mb->m_pkthdr.rcvif = NULL; mb->m_pkthdr.csum_flags = 0; SLIST_INIT(&mb->m_pkthdr.tags); } mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); if (mb->m_ext.ext_buf == NULL) { (void)m_free(mb); mb = NULL; } else { _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } #ifdef MAC if ((flags & M_PKTHDR) && (mac_init_mbuf(mb, how) != 0)) { m_free(mb); return NULL; } #endif return (mb); } /* * Fetch a single mbuf cluster and attach it to an existing mbuf. If * successfull, configures the provided mbuf to have mbuf->m_ext.ext_buf * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags. * The M_EXT bit is not set on failure. * * Arguments: * - mb: the existing mbuf to which to attach the allocated cluster. * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks * if really starved for memory. M_DONTWAIT to never block. */ void m_clget(struct mbuf *mb, int how) { mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF, 0, NULL); if (mb->m_ext.ext_buf != NULL) { _mcl_setup(mb); _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); } } /* * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. If the setting * up of the reference count fails, the M_EXT bit will not be set. If * successfull, the M_EXT bit is set in the mbuf's flags. * * Arguments: * - mb: the existing mbuf to which to attach the provided buffer. * - buf: the address of the provided external storage buffer. * - size: the size of the provided buffer. * - freef: a pointer to a routine that is responsible for freeing the * provided external storage buffer. * - args: a pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * - flags: any other flags to be passed to the provided mbuf. * - type: the type that the external storage buffer should be labeled with. */ void m_extadd(struct mbuf *mb, caddr_t buf, u_int size, void (*freef)(void *, void *), void *args, int flags, int type) { u_int *ref_cnt = NULL; if (type == EXT_CLUSTER) ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]; else if (type == EXT_EXTREF) ref_cnt = mb->m_ext.ref_cnt; _mext_init_ref(mb, ref_cnt); if (mb->m_ext.ref_cnt != NULL) { mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_args = args; mb->m_ext.ext_type = type; } } /* * Change type of provided mbuf. This is a relatively expensive operation * (due to the cost of statistics manipulations) and should be avoided, where * possible. * * Arguments: * - mb: the provided mbuf for which the type needs to be changed. * - new_type: the new type to change the mbuf to. 
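* Note that only the general list's mbtypes counters are adjusted here, in keeping with the relaxed ("XXX: No consistency") statistics used elsewhere in this file.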
*/ void m_chtype(struct mbuf *mb, short new_type) { struct mb_gen_list *gen_list; gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); MB_LOCK_CONT(gen_list); MB_MBTYPES_DEC(gen_list, mb->m_type, 1); MB_MBTYPES_INC(gen_list, new_type, 1); MB_UNLOCK_CONT(gen_list); mb->m_type = new_type; } diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 9caeed9454e1..55d828df28ee 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -1,862 +1,861 @@ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_mac.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; /* * sysctl(8) exported objects */ SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, &max_linkhdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, &max_protohdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, &max_datalen, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. 
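* Unlike m_dup_pkthdr(), the packet tags are handed over rather than deep-copied, so no allocation is needed and the operation cannot fail.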
*/ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster")); #ifdef MAC if (to->m_flags & M_PKTHDR) mac_destroy_mbuf(to); #endif to->m_flags = from->m_flags & M_COPYFLAGS; to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ #ifdef MAC mac_init_mbuf(to, 1); /* XXXMAC no way to fail */ mac_create_mbuf_from_mbuf(from, to); #endif SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with MGETHDR. Many users * (e.g. m_copy*, m_prepend) use MGET and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif #ifdef MAC if (to->m_flags & M_PKTHDR) mac_destroy_mbuf(to); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; #ifdef MAC mac_init_mbuf(to, 1); /* XXXMAC no way to fail */ mac_create_mbuf_from_mbuf(from, to); #endif SLIST_INIT(&to->m_pkthdr.tags); - return (m_tag_copy_chain(to, from, (how & M_TRYWAIT) ? M_WAITOK : - M_NOWAIT)); + return (m_tag_copy_chain(to, from, MBTOM(how))); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; MGET(mn, how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) { M_MOVE_PKTHDR(mn, m); #ifdef MAC mac_destroy_mbuf(m); #endif } mn->m_next = m; m = mn; if (len < MHLEN) MH_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_TRYWAIT/M_DONTWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. 
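* Callers that need a writable copy of a packet should look at m_dup() instead.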
*/ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = 0; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } MGET(n, wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } if (top == NULL) mbstat.m_mcfail++; /* XXX: No consistency. */ return (top); nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MGET(n, how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { MGET(o, how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; n->m_ext = m->m_ext; n->m_flags |= M_EXT; MEXT_ADD_REF(m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. 
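* The new chain is packed into clusters whenever at least MINCLSIZE bytes remain to be copied, so it may end up with fewer mbufs than the original.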
*/ struct mbuf * m_dup(struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ MGET(n, how, m->m_type); if (n == NULL) goto nospace; if (top == NULL) { /* first one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) goto nospace; nsize = MHLEN; } else /* not the first one */ nsize = MLEN; if (remain >= MINCLSIZE) { MCLGET(n, how); if ((n->m_flags & M_EXT) == 0) { (void)m_free(n); goto nospace; } nsize = MCLBYTES; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); mbstat.m_mcfail++; /* XXX: No consistency. */ return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (m->m_flags & M_EXT || m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } m = mp; if (mp->m_flags & M_PKTHDR) m->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; break; } count -= m->m_len; } while (m->m_next) (m = m->m_next) ->m_len = 0; } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod and dtom * will work for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. 
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		MGET(m, M_DONTWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR)
			M_MOVE_PKTHDR(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void)m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	mbstat.m_mpfail++;	/* XXX: No consistency. */
	return (NULL);
}

/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 *
 * Note that the resulting mbufs might be read-only, because the new
 * mbuf can end up sharing an mbuf cluster with the original mbuf if
 * the "breaking point" happens to lie within a cluster mbuf.  Use the
 * M_WRITABLE() macro to check for this case.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	u_int len = len0, remain;

	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR) {
		MGETHDR(n, wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void)m_free(n);
				return (NULL);
			} else {
				n->m_len = 0;
				return (n);
			}
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		MGET(n, wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		MEXT_ADD_REF(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}
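
/*
 * Usage note (illustrative sketch only, not compiled): the canonical
 * m_pullup() idiom used by protocol input routines to make a fixed-size
 * header contiguous before casting it with mtod().  "struct foo_hdr" and
 * "foostat" are hypothetical stand-ins for a real protocol's header type
 * and statistics; on failure m_pullup() has already freed the chain.
 *
 *	if (m->m_len < sizeof(struct foo_hdr) &&
 *	    (m = m_pullup(m, sizeof(struct foo_hdr))) == NULL) {
 *		foostat.foos_tooshort++;
 *		return;
 *	}
 *	fh = mtod(m, struct foo_hdr *);
 */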
/*
 * Routine to copy from device local memory into mbufs.
 * Note that the `off' argument is the offset into the first mbuf of the
 * target chain at which to begin placing the copied data.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
    void (*copy)(char *from, caddr_t to, u_int len))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int len;

	if (off < 0 || off > MHLEN)
		return (NULL);
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	len = MHLEN;

	while (totlen > 0) {
		if (top) {
			MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return (NULL);
			}
			len = MLEN;
		}
		if (totlen + off >= MINCLSIZE) {
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT)
				len = MCLBYTES;
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (top == NULL && totlen + off + max_linkhdr <= len) {
				m->m_data += max_linkhdr;
				len -= max_linkhdr;
			}
		}
		if (off) {
			m->m_data += off;
			len -= off;
			off = 0;
		}
		m->m_len = len = min(totlen, len);
		if (copy)
			copy(buf, mtod(m, caddr_t), (u_int)len);
		else
			bcopy(buf, mtod(m, caddr_t), (u_int)len);
		buf += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
	}
	return (top);
}

/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_get_clrd(M_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min(m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(M_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}

/*
 * Dump the contents of a packet header mbuf chain to the console, one mbuf
 * per line, for debugging.
 */
void
m_print(const struct mbuf *m)
{
	int len;
	const struct mbuf *m2;

	len = m->m_pkthdr.len;
	m2 = m;
	while (len) {
		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
		len -= m2->m_len;
		m2 = m2->m_next;
	}
	return;
}

/*
 * Set the packet header length to the actual amount of data in the chain.
 */
u_int
m_fixhdr(struct mbuf *m0)
{
	u_int len;

	len = m_length(m0, NULL);
	m0->m_pkthdr.len = len;
	return (len);
}

/*
 * Return the total data length of a chain and, optionally, a pointer to
 * its last mbuf.
 */
u_int
m_length(struct mbuf *m0, struct mbuf **last)
{
	struct mbuf *m;
	u_int len;

	len = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		len += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	if (last != NULL)
		*last = m;
	return (len);
}

/*
 * Defragment an mbuf chain, returning the shortest possible
 * chain of mbufs and clusters.  If allocation fails and
 * this cannot be completed, NULL will be returned, but
 * the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain
 * will be returned.
 *
 * If an mbuf without a packet header is passed in, the original
 * chain will be returned unharmed.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

	if (m0->m_pkthdr.len > MHLEN)
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	else
		m_final = m_gethdr(how, MT_DATA);

	if (m_final == NULL)
		goto nospace;

	if (m_dup_pkthdr(m_final, m0, how) == 0)
		goto nospace;

	m_new = m_final;

	while (progress < m0->m_pkthdr.len) {
		length = m0->m_pkthdr.len - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;

		if (m_new == NULL) {
			if (length > MLEN)
				m_new = m_getcl(how, MT_DATA, 0);
			else
				m_new = m_get(how, MT_DATA);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
	if (m0->m_next == NULL)
		m_defraguseless++;
	m_freem(m0);
	m0 = m_final;
	m_defragpackets++;
	m_defragbytes += m0->m_pkthdr.len;
	return (m0);
nospace:
	m_defragfailure++;
	if (m_new)
		m_free(m_new);
	if (m_final)
		m_freem(m_final);
	return (NULL);
}
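
/*
 * Usage note (illustrative sketch only, not compiled): m_defrag() is
 * typically called from a driver's transmit path when a chain has more
 * fragments than the hardware can describe.  On failure the original chain
 * is untouched, so the caller may free it or try to send it as-is; the name
 * "m_head" below is a hypothetical stand-in for the driver's chain pointer.
 *
 *	struct mbuf *m_new;
 *
 *	m_new = m_defrag(m_head, M_DONTWAIT);
 *	if (m_new == NULL) {
 *		m_freem(m_head);
 *		return (ENOBUFS);
 *	}
 *	m_head = m_new;
 */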