Index: fs/nfs/nfs_commonkrpc.c =================================================================== --- fs/nfs/nfs_commonkrpc.c +++ fs/nfs/nfs_commonkrpc.c @@ -96,6 +96,7 @@ extern void (*ncl_call_invalcaches)(struct vnode *); extern int nfs_numnfscbd; extern int nfscl_debuglevel; +extern int maxbcachebuf; SVCPOOL *nfscbd_pool; static int nfsrv_gsscallbackson = 0; @@ -161,7 +162,7 @@ struct ucred *cred, NFSPROC_T *p, int callback_retry_mult) { int rcvreserve, sndreserve; - int pktscale; + int pktscale, pktscalesav; struct sockaddr *saddr; struct ucred *origcred; CLIENT *client; @@ -210,6 +211,7 @@ pktscale = 2; if (pktscale > 64) pktscale = 64; + pktscalesav = pktscale; /* * soreserve() can fail if sb_max is too small, so shrink pktscale * and try again if there is an error. @@ -228,8 +230,12 @@ goto out; } do { - if (error != 0 && pktscale > 2) + if (error != 0 && pktscale > 2) { + if (nmp != NULL && nrp->nr_sotype == SOCK_STREAM && + pktscale == pktscalesav) + printf("Consider increasing kern.ipc.maxsockbuf\n"); pktscale--; + } if (nrp->nr_sotype == SOCK_DGRAM) { if (nmp != NULL) { sndreserve = (NFS_MAXDGRAMDATA + NFS_MAXPKTHDR) * @@ -243,15 +249,19 @@ if (nrp->nr_sotype != SOCK_STREAM) panic("nfscon sotype"); if (nmp != NULL) { - sndreserve = (NFS_MAXBSIZE + NFS_MAXPKTHDR + + sndreserve = (NFS_MAXBSIZE + NFS_MAXXDR + sizeof (u_int32_t)) * pktscale; - rcvreserve = (NFS_MAXBSIZE + NFS_MAXPKTHDR + + rcvreserve = (NFS_MAXBSIZE + NFS_MAXXDR + sizeof (u_int32_t)) * pktscale; } else { sndreserve = rcvreserve = 1024 * pktscale; } } error = soreserve(so, sndreserve, rcvreserve); + if (error != 0 && nmp != NULL && nrp->nr_sotype == SOCK_STREAM && + pktscale <= 2) + printf("Must increase kern.ipc.maxsockbuf or reduce" + " rsize, wsize\n"); } while (error != 0 && pktscale > 2); soclose(so); if (error) { Index: fs/nfs/nfsport.h =================================================================== --- fs/nfs/nfsport.h +++ fs/nfs/nfsport.h @@ -1028,7 +1028,7 @@ }; #ifndef NFS_MAXBSIZE -#define NFS_MAXBSIZE MAXBCACHEBUF +#define NFS_MAXBSIZE (maxbcachebuf) #endif /* Index: fs/nfs/nfsproto.h =================================================================== --- fs/nfs/nfsproto.h +++ fs/nfs/nfsproto.h @@ -56,8 +56,22 @@ #define NFS_MAXDGRAMDATA 16384 #define NFS_MAXPATHLEN 1024 #define NFS_MAXNAMLEN 255 +/* + * Calculating the maximum XDR overhead for an NFS RPC isn't easy. + * NFS_MAXPKTHDR is antiquated and assumes AUTH_SYS over UDP. + * NFS_MAXXDR should be sufficient for all NFS versions over TCP. + * It includes: + * - Maximum RPC message header. It can include 2 400byte authenticators plus + * a machine name of unlimited length, although it is usually relatively + * small. + * - XDR overheads for the NFSv4 compound. This can include Owner and + * Owner_group strings, which are usually fairly small, but are allowed + * to be up to 1024 bytes each. + * 4096 is overkill, but should always be sufficient. + */ #define NFS_MAXPKTHDR 404 -#define NFS_MAXPACKET (NFS_SRVMAXIO + 2048) +#define NFS_MAXXDR 4096 +#define NFS_MAXPACKET (NFS_SRVMAXIO + NFS_MAXXDR) #define NFS_MINPACKET 20 #define NFS_FABLKSIZE 512 /* Size in bytes of a block wrt fa_blocks */ #define NFSV4_MINORVERSION 0 /* V4 Minor version */ Index: fs/nfsclient/nfs_clrpcops.c =================================================================== --- fs/nfsclient/nfs_clrpcops.c +++ fs/nfsclient/nfs_clrpcops.c @@ -4703,7 +4703,7 @@ struct nfssockreq *nrp, uint32_t sequenceid, int mds, struct ucred *cred, NFSPROC_T *p) { - uint32_t crflags, *tl; + uint32_t crflags, maxval, *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error, irdcnt; @@ -4721,8 +4721,8 @@ /* Fill in fore channel attributes. */ NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED); *tl++ = 0; /* Header pad size */ - *tl++ = txdr_unsigned(100000); /* Max request size */ - *tl++ = txdr_unsigned(100000); /* Max response size */ + *tl++ = txdr_unsigned(nmp->nm_wsize + NFS_MAXXDR);/* Max request size */ + *tl++ = txdr_unsigned(nmp->nm_rsize + NFS_MAXXDR);/* Max reply size */ *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(20); /* Max operations */ *tl++ = txdr_unsigned(64); /* Max slots */ @@ -4769,7 +4769,26 @@ /* Get the fore channel slot count. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); - tl += 3; /* Skip the other counts. */ + tl++; /* Skip the header pad size. */ + + /* Make sure nm_wsize is small enough. */ + maxval = fxdr_unsigned(uint32_t, *tl++); + while (maxval < nmp->nm_wsize + NFS_MAXXDR) { + if (nmp->nm_wsize > 8096) + nmp->nm_wsize /= 2; + else + break; + } + + /* Make sure nm_rsize is small enough. */ + maxval = fxdr_unsigned(uint32_t, *tl++); + while (maxval < nmp->nm_rsize + NFS_MAXXDR) { + if (nmp->nm_rsize > 8096) + nmp->nm_rsize /= 2; + else + break; + } + sep->nfsess_maxcache = fxdr_unsigned(int, *tl++); tl++; sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++); Index: fs/nfsclient/nfs_clvfsops.c =================================================================== --- fs/nfsclient/nfs_clvfsops.c +++ fs/nfsclient/nfs_clvfsops.c @@ -83,6 +83,7 @@ extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern struct mtx ncl_iod_mutex; +extern int maxbcachebuf; NFSCLSTATEMUTEX; MALLOC_DEFINE(M_NEWNFSREQ, "newnfsclient_req", "NFS request header"); Index: kern/vfs_bio.c =================================================================== --- kern/vfs_bio.c +++ kern/vfs_bio.c @@ -131,6 +131,7 @@ static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); +static void maxbcachebuf_adjust(void); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) @@ -245,6 +246,10 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); +static int bkvasize = BKVASIZE; +int maxbcachebuf = MAXBCACHEBUF; +SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, + "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. @@ -847,6 +852,33 @@ } /* + * Adjust the maxbcachbuf and bkvasize tunables. + */ +static void +maxbcachebuf_adjust(void) +{ + int i; + + /* + * maxbcachebuf must be a power of 2 >= MAXBSIZE. + * If it has been tuned, set bkvasize to maxbcachebuf / 2. + */ + i = 2; + while (i * 2 <= maxbcachebuf) + i *= 2; + maxbcachebuf = i; + if (maxbcachebuf < MAXBSIZE) + maxbcachebuf = MAXBSIZE; + if (maxbcachebuf != MAXBCACHEBUF && maxbcachebuf > MAXBSIZE) { + bkvasize = maxbcachebuf / 2; + if (bkvasize < BKVASIZE) + bkvasize = BKVASIZE; + } + if (maxbcachebuf != MAXBCACHEBUF || bkvasize != BKVASIZE) + printf("maxbcachebuf=%d bkvasize=%d\n", maxbcachebuf, bkvasize); +} + +/* * bd_speedup - speedup the buffer cache flushing code */ void @@ -893,8 +925,9 @@ */ physmem_est = physmem_est * (PAGE_SIZE / 1024); + maxbcachebuf_adjust(); /* - * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. + * The nominal buffer size (and minimum KVA allocation) is bkvasize. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing @@ -904,7 +937,7 @@ * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { - int factor = 4 * BKVASIZE / 1024; + int factor = 4 * bkvasize / 1024; nbuf = 50; if (physmem_est > 4096) @@ -914,14 +947,14 @@ nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); - if (maxbcache && nbuf > maxbcache / BKVASIZE) - nbuf = maxbcache / BKVASIZE; + if (maxbcache && nbuf > maxbcache / bkvasize) + nbuf = maxbcache / bkvasize; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ - maxbuf = (LONG_MAX / 3) / BKVASIZE; + maxbuf = (LONG_MAX / 3) / bkvasize; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, @@ -943,8 +976,8 @@ * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { - maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; - buf_sz = (long)nbuf * BKVASIZE; + maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * bkvasize; + buf_sz = (long)nbuf * bkvasize; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* @@ -973,7 +1006,7 @@ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) - nbuf = buf_sz / BKVASIZE; + nbuf = buf_sz / bkvasize; } /* @@ -1003,7 +1036,6 @@ struct buf *bp; int i; - CTASSERT(MAXBCACHEBUF >= MAXBSIZE); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) @@ -1044,13 +1076,13 @@ * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * - * maxbufspace is based on BKVASIZE. Allocating buffers larger then + * maxbufspace is based on bkvasize. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ - maxbufspace = (long)nbuf * BKVASIZE; - hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); + maxbufspace = (long)nbuf * bkvasize; + hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; @@ -1062,9 +1094,9 @@ * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ - hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF), + hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); - lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); + lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into @@ -1086,9 +1118,9 @@ * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our - * buffer space assuming BKVASIZE'd buffers. + * buffer space assuming bkvasize'd buffers. */ - while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { + while ((long)hidirtybuffers * bkvasize > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; @@ -2887,7 +2919,7 @@ if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva - * in BKVASIZE chunks. XXX with vmem we can do page size. + * in bkvasize chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; @@ -3484,9 +3516,9 @@ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); - if (size > MAXBCACHEBUF) - panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size, - MAXBCACHEBUF); + if (size > maxbcachebuf) + panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, + maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC);