Index: sys/dev/ahci/ahci.h
===================================================================
--- sys/dev/ahci/ahci.h
+++ sys/dev/ahci/ahci.h
@@ -310,13 +310,8 @@
 #define AHCI_P_DEVSLP_DM 0x0e000000
 #define AHCI_P_DEVSLP_DM_SHIFT 25
-/* Just to be sure, if building as module. */
-#if MAXPHYS < 512 * 1024
-#undef MAXPHYS
-#define MAXPHYS 512 * 1024
-#endif
 /* Pessimistic prognosis on number of required S/G entries */
-#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8))
+#define AHCI_SG_ENTRIES (roundup(btoc(maxphys) + 1, 8))
 /* Command list. 32 commands. First, 1Kbyte aligned. */
 #define AHCI_CL_OFFSET 0
 #define AHCI_CL_SIZE 32
@@ -344,7 +339,7 @@
     u_int8_t cfis[64];
     u_int8_t acmd[32];
     u_int8_t reserved[32];
-    struct ahci_dma_prd prd_tab[AHCI_SG_ENTRIES];
+    struct ahci_dma_prd prd_tab[];
 } __packed;
 
 struct ahci_cmd_list {
Index: sys/dev/ahci/ahci.c
===================================================================
--- sys/dev/ahci/ahci.c
+++ sys/dev/ahci/ahci.c
@@ -2868,7 +2868,7 @@
     cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
     cpi->protocol = PROTO_ATA;
     cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
-    cpi->maxio = MAXPHYS;
+    cpi->maxio = maxphys;
     /* ATI SB600 can't handle 256 sectors with FPDMA (NCQ). */
     if (ch->quirks & AHCI_Q_MAXIO_64K)
         cpi->maxio = min(cpi->maxio, 128 * 512);
Index: sys/dev/md/md.c
===================================================================
--- sys/dev/md/md.c
+++ sys/dev/md/md.c
@@ -960,9 +960,10 @@
         piov = auio.uio_iov;
     } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
         pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
+        MPASS((pb->b_flags & B_MAXPHYS) != 0);
         bp->bio_resid = len;
 unmapped_step:
-        npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
+        npages = atop(min(maxphys, round_page(len + (ma_offs &
             PAGE_MASK))));
         iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
         KASSERT(iolen > 0, ("zero iolen"));
@@ -1684,7 +1685,7 @@
         sectsize = DEV_BSIZE;
     else
         sectsize = mdr->md_sectorsize;
-    if (sectsize > MAXPHYS || mdr->md_mediasize < sectsize)
+    if (sectsize > maxphys || mdr->md_mediasize < sectsize)
         return (EINVAL);
     if (mdr->md_options & MD_AUTOUNIT)
         sc = mdnew(-1, &error, mdr->md_type);
Index: sys/dev/siis/siis.h
===================================================================
--- sys/dev/siis/siis.h
+++ sys/dev/siis/siis.h
@@ -263,13 +263,8 @@
 #define SIIS_OFFSET 0x100
 #define SIIS_STEP 0x80
-/* Just to be sure, if building as module. */
-#if MAXPHYS < 512 * 1024
-#undef MAXPHYS
-#define MAXPHYS 512 * 1024
-#endif
 /* Pessimistic prognosis on number of required S/G entries */
-#define SIIS_SG_ENTRIES (roundup(btoc(MAXPHYS), 4) + 1)
+#define SIIS_SG_ENTRIES (roundup(btoc(maxphys), 4) + 1)
 /* Command tables. Up to 32 commands, Each, 128byte aligned. */
 #define SIIS_CT_OFFSET 0
 #define SIIS_CT_SIZE (32 + 16 + SIIS_SG_ENTRIES * 16)
@@ -287,12 +282,12 @@
 } __packed;
 
 struct siis_cmd_ata {
-    struct siis_dma_prd prd[1 + SIIS_SG_ENTRIES];
+    struct siis_dma_prd prd[1];
 } __packed;
 
 struct siis_cmd_atapi {
     u_int8_t ccb[16];
-    struct siis_dma_prd prd[SIIS_SG_ENTRIES];
+    struct siis_dma_prd prd[1];
 } __packed;
 
 struct siis_cmd {
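Note on the two driver headers above: btoc() applied to the runtime variable maxphys is no longer a compile-time constant, which is why the fixed-size PRD arrays are dropped and the tables are sized at runtime through AHCI_SG_ENTRIES and SIIS_SG_ENTRIES. (siis_cmd_ata/siis_cmd_atapi keep a one-element prd[] because they live inside a union inside struct siis_cmd, where a flexible array member would be invalid C.) A minimal userland sketch of the arithmetic, with btoc and roundup re-created locally and 4 KB pages assumed (not taken from the patch):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL /* assumption: 4 KB pages */
    #define btoc(x)   (((x) + PAGE_SIZE - 1) / PAGE_SIZE)
    #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

    int
    main(void)
    {
        unsigned long mp;

        /* S/G table sizes for a few candidate maxphys values. */
        for (mp = 128 * 1024; mp <= 1024 * 1024; mp *= 2)
            printf("maxphys=%7lu -> AHCI S/G %lu, SIIS S/G %lu\n", mp,
                roundup(btoc(mp) + 1, 8), roundup(btoc(mp), 4) + 1);
        return (0);
    }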
Index: sys/dev/siis/siis.c
===================================================================
--- sys/dev/siis/siis.c
+++ sys/dev/siis/siis.c
@@ -1967,7 +1967,7 @@
     cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
     cpi->protocol = PROTO_ATA;
     cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
-    cpi->maxio = MAXPHYS;
+    cpi->maxio = maxphys;
     cpi->hba_vendor = pci_get_vendor(parent);
     cpi->hba_device = pci_get_device(parent);
     cpi->hba_subvendor = pci_get_subvendor(parent);
Index: sys/fs/cd9660/cd9660_vfsops.c
===================================================================
--- sys/fs/cd9660/cd9660_vfsops.c
+++ sys/fs/cd9660/cd9660_vfsops.c
@@ -238,8 +238,8 @@
         goto out;
     if (devvp->v_rdev->si_iosize_max != 0)
         mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
-    if (mp->mnt_iosize_max > MAXPHYS)
-        mp->mnt_iosize_max = MAXPHYS;
+    if (mp->mnt_iosize_max > maxphys)
+        mp->mnt_iosize_max = maxphys;
 
     bo = &devvp->v_bufobj;
Index: sys/fs/ext2fs/ext2_vfsops.c
===================================================================
--- sys/fs/ext2fs/ext2_vfsops.c
+++ sys/fs/ext2fs/ext2_vfsops.c
@@ -876,8 +876,8 @@
     bo->bo_ops = g_vfs_bufops;
     if (devvp->v_rdev->si_iosize_max != 0)
         mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
-    if (mp->mnt_iosize_max > MAXPHYS)
-        mp->mnt_iosize_max = MAXPHYS;
+    if (mp->mnt_iosize_max > maxphys)
+        mp->mnt_iosize_max = maxphys;
 
     bp = NULL;
     ump = NULL;
Index: sys/fs/fuse/fuse_vfsops.c
===================================================================
--- sys/fs/fuse/fuse_vfsops.c
+++ sys/fs/fuse/fuse_vfsops.c
@@ -441,7 +441,7 @@
     }
     memset(mp->mnt_stat.f_mntfromname, 0, MNAMELEN);
     strlcpy(mp->mnt_stat.f_mntfromname, fspec, MNAMELEN);
-    mp->mnt_iosize_max = MAXPHYS;
+    mp->mnt_iosize_max = maxphys;
 
     /* Now handshaking with daemon */
     fuse_internal_send_init(data, td);
Index: sys/fs/msdosfs/msdosfs_vfsops.c
===================================================================
--- sys/fs/msdosfs/msdosfs_vfsops.c
+++ sys/fs/msdosfs/msdosfs_vfsops.c
@@ -429,8 +429,8 @@
     VOP_UNLOCK(devvp);
     if (dev->si_iosize_max != 0)
         mp->mnt_iosize_max = dev->si_iosize_max;
-    if (mp->mnt_iosize_max > MAXPHYS)
-        mp->mnt_iosize_max = MAXPHYS;
+    if (mp->mnt_iosize_max > maxphys)
+        mp->mnt_iosize_max = maxphys;
 
     /*
      * Read the boot sector of the filesystem, and then check the
Index: sys/fs/udf/udf_vfsops.c
===================================================================
--- sys/fs/udf/udf_vfsops.c
+++ sys/fs/udf/udf_vfsops.c
@@ -338,8 +338,8 @@
 
     if (devvp->v_rdev->si_iosize_max != 0)
         mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
-    if (mp->mnt_iosize_max > MAXPHYS)
-        mp->mnt_iosize_max = MAXPHYS;
+    if (mp->mnt_iosize_max > maxphys)
+        mp->mnt_iosize_max = maxphys;
 
     /* XXX: should be M_WAITOK */
     udfmp = malloc(sizeof(struct udf_mnt), M_UDFMOUNT,
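The kern_mib.c change below has to keep kern.maxphys readable for FreeBSD 12 era binaries that pass an int-sized buffer. A hedged sketch of a userland consumer that copes with either width; sysctlbyname(3) is the stock libc call, the fallback logic is illustrative and not part of the patch:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        u_long ul;
        u_int ui;
        size_t len;

        len = sizeof(ul);
        if (sysctlbyname("kern.maxphys", &ul, &len, NULL, 0) == 0 &&
            len == sizeof(ul)) {
            printf("maxphys: %lu\n", ul);
        } else {
            /* Older kernel: the OID is a plain int. */
            len = sizeof(ui);
            if (sysctlbyname("kern.maxphys", &ui, &len, NULL, 0) == 0)
                printf("maxphys: %u\n", ui);
        }
        return (0);
    }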
"Maximum block I/O access size"); +#ifdef COMPAT_FREEBSD12 +static int +sysctl_maxphys(SYSCTL_HANDLER_ARGS) +{ + u_long lvalue; + int ivalue; + + lvalue = maxphys; + if (sizeof(int) == sizeof(u_long) || req->oldlen >= sizeof(u_long)) + return (sysctl_handle_long(oidp, &lvalue, 0, req)); + if (lvalue > INT_MAX) + return (sysctl_handle_long(oidp, &lvalue, 0, req)); + ivalue = lvalue; + return (sysctl_handle_int(oidp, &ivalue, 0, req)); +} +SYSCTL_PROC(_kern, KERN_MAXPHYS, maxphys, CTLTYPE_LONG | CTLFLAG_RDTUN | + CTLFLAG_NOFETCH | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, + NULL, 0, sysctl_maxphys, "UL", "Maximum block I/O access size"); +#else +SYSCTL_ULONG(_kern, KERN_MAXPHYS, maxphys, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD, + &maxphys, 0, "Maximum block I/O access size"); +#endif SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncpus, 0, "Number of active CPUs"); Index: sys/kern/kern_physio.c =================================================================== --- sys/kern/kern_physio.c +++ sys/kern/kern_physio.c @@ -69,7 +69,7 @@ * need to reject any requests that will not fit into one buffer. */ if (dev->si_flags & SI_NOSPLIT && - (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > MAXPHYS || + (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > maxphys || uio->uio_iovcnt > 1)) { /* * Tell the user why his I/O was rejected. @@ -78,10 +78,10 @@ uprintf("%s: request size=%zd > si_iosize_max=%d; " "cannot split request\n", devtoname(dev), uio->uio_resid, dev->si_iosize_max); - if (uio->uio_resid > MAXPHYS) - uprintf("%s: request size=%zd > MAXPHYS=%d; " + if (uio->uio_resid > maxphys) + uprintf("%s: request size=%zd > maxphys=%lu; " "cannot split request\n", devtoname(dev), - uio->uio_resid, MAXPHYS); + uio->uio_resid, maxphys); if (uio->uio_iovcnt > 1) uprintf("%s: request vectors=%d > 1; " "cannot split request\n", devtoname(dev), @@ -101,12 +101,13 @@ pages = NULL; } else if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { pbuf = NULL; - maxpages = btoc(MIN(uio->uio_resid, MAXPHYS)) + 1; + maxpages = btoc(MIN(uio->uio_resid, maxphys)) + 1; pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK); } else { pbuf = uma_zalloc(pbuf_zone, M_WAITOK); + MPASS((pbuf->b_flags & B_MAXPHYS) != 0); sa = pbuf->b_data; - maxpages = btoc(MAXPHYS); + maxpages = btoc(maxphys); pages = pbuf->b_pages; } prot = VM_PROT_READ; @@ -144,13 +145,13 @@ bp->bio_length = uio->uio_iov[i].iov_len; if (bp->bio_length > dev->si_iosize_max) bp->bio_length = dev->si_iosize_max; - if (bp->bio_length > MAXPHYS) - bp->bio_length = MAXPHYS; + if (bp->bio_length > maxphys) + bp->bio_length = maxphys; /* * Make sure the pbuf can map the request. - * The pbuf has kvasize = MAXPHYS, so a request - * larger than MAXPHYS - PAGE_SIZE must be + * The pbuf has kvasize = maxphys, so a request + * larger than maxphys - PAGE_SIZE must be * page aligned or it will be fragmented. */ poff = (vm_offset_t)base & PAGE_MASK; Index: sys/kern/kern_sendfile.c =================================================================== --- sys/kern/kern_sendfile.c +++ sys/kern/kern_sendfile.c @@ -885,7 +885,7 @@ * do any heuristics and use exactly the value supplied by * application. Otherwise, we allow readahead up to "rem". * If application wants more, let it be, but there is no - * reason to go above MAXPHYS. Also check against "obj_size", + * reason to go above maxphys. Also check against "obj_size", * since vm_pager_has_page() can hint beyond EOF. 
*/ if (flags & SF_USER_READAHEAD) { @@ -895,7 +895,7 @@ npages; rhpages += SF_READAHEAD(flags); } - rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); + rhpages = min(howmany(maxphys, PAGE_SIZE), rhpages); rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - npages, rhpages); Index: sys/kern/subr_param.c =================================================================== --- sys/kern/subr_param.c +++ sys/kern/subr_param.c @@ -99,9 +99,10 @@ int ngroups_max; /* max # groups per process */ int nswbuf; pid_t pid_max = PID_MAX; -long maxswzone; /* max swmeta KVA storage */ -long maxbcache; /* max buffer cache KVA storage */ -long maxpipekva; /* Limit on pipe KVA */ +u_long maxswzone; /* max swmeta KVA storage */ +u_long maxbcache; /* max buffer cache KVA storage */ +u_long maxpipekva; /* Limit on pipe KVA */ +u_long maxphys; int vm_guest = VM_GUEST_NO; /* Running as virtual machine guest? */ u_long maxtsiz; /* max text size */ u_long dfldsiz; /* initial data size limit */ @@ -289,6 +290,8 @@ nbuf = NBUF; TUNABLE_INT_FETCH("kern.nbuf", &nbuf); TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt); + maxphys = MAXPHYS; + TUNABLE_ULONG_FETCH("kern.maxphys", &maxphys); /* * Physical buffers are pre-allocated buffers (struct buf) that @@ -300,7 +303,7 @@ * The default for maxpipekva is min(1/64 of the kernel address space, * max(1/64 of main memory, 512KB)). See sys_pipe.c for more details. */ - maxpipekva = (physpages / 64) * PAGE_SIZE; + maxpipekva = ptoa(physpages / 64); TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva); if (maxpipekva < 512 * 1024) maxpipekva = 512 * 1024; Index: sys/kern/vfs_aio.c =================================================================== --- sys/kern/vfs_aio.c +++ sys/kern/vfs_aio.c @@ -1252,14 +1252,16 @@ ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { - if (cb->aio_nbytes > MAXPHYS) { + if (cb->aio_nbytes > maxphys) { error = -1; goto unref; } pbuf = NULL; + job->pages = malloc(sizeof(vm_page_t) * atop(round_page( + cb->aio_nbytes)) + 1, M_TEMP, M_WAITOK | M_ZERO); } else { - if (cb->aio_nbytes > MAXPHYS - poff) { + if (cb->aio_nbytes > maxphys - poff) { error = -1; goto unref; } @@ -1273,6 +1275,7 @@ AIO_LOCK(ki); ki->kaio_buffer_count++; AIO_UNLOCK(ki); + job->pages = pbuf->b_pages; } job->bp = bp = g_alloc_bio(); @@ -1320,6 +1323,8 @@ AIO_UNLOCK(ki); uma_zfree(pbuf_zone, pbuf); job->pbuf = NULL; + } else { + free(job->pages, M_TEMP); } g_destroy_bio(bp); job->bp = NULL; @@ -2342,7 +2347,8 @@ /* Release mapping into kernel space. */ userp = job->userproc; ki = userp->p_aioinfo; - if (job->pbuf) { + vm_page_unhold_pages(job->pages, job->npages); + if (job->pbuf != NULL) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); uma_zfree(pbuf_zone, job->pbuf); job->pbuf = NULL; @@ -2350,9 +2356,10 @@ AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); - } else + } else { + free(job->pages, M_TEMP); atomic_subtract_int(&num_unmapped_aio, 1); - vm_page_unhold_pages(job->pages, job->npages); + } bp = job->bp; job->bp = NULL; Index: sys/kern/vfs_bio.c =================================================================== --- sys/kern/vfs_bio.c +++ sys/kern/vfs_bio.c @@ -147,8 +147,14 @@ #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) -static struct buf *buf; /* buffer header pool */ -extern struct buf *swbuf; /* Swap buffer header pool. 
Index: sys/kern/vfs_bio.c
===================================================================
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -147,8 +147,14 @@
 #define BD_RUN_UNLOCK(bd)   mtx_unlock(BD_RUN_LOCKPTR((bd)))
 #define BD_DOMAIN(bd)       (bd - bdomain)
 
-static struct buf *buf;     /* buffer header pool */
-extern struct buf *swbuf;   /* Swap buffer header pool. */
+static char *buf;           /* buffer header pool */
+static struct buf *
+nbufp(unsigned i)
+{
+    return ((struct buf *)(buf + (sizeof(struct buf) +
+        sizeof(vm_page_t) * atop(maxbcachebuf)) * i));
+}
+
 caddr_t __read_mostly unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
@@ -994,8 +1000,8 @@
         maxbcachebuf = i;
     if (maxbcachebuf < MAXBSIZE)
         maxbcachebuf = MAXBSIZE;
-    if (maxbcachebuf > MAXPHYS)
-        maxbcachebuf = MAXPHYS;
+    if (maxbcachebuf > maxphys)
+        maxbcachebuf = maxphys;
     if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
         printf("maxbcachebuf=%d\n", maxbcachebuf);
 }
@@ -1113,10 +1119,10 @@
         biotmap_sz = buf_sz / TRANSIENT_DENOM;
         buf_sz -= biotmap_sz;
     }
-    if (biotmap_sz / INT_MAX > MAXPHYS)
+    if (biotmap_sz / INT_MAX > maxphys)
         bio_transient_maxcnt = INT_MAX;
     else
-        bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+        bio_transient_maxcnt = biotmap_sz / maxphys;
     /*
      * Artificially limit to 1024 simultaneous in-flight I/Os
      * using the transient mapping.
@@ -1136,10 +1142,11 @@
     /*
      * Reserve space for the buffer cache buffers
      */
-    buf = (void *)v;
-    v = (caddr_t)(buf + nbuf);
+    buf = (char *)v;
+    v = (caddr_t)buf + (sizeof(struct buf) + sizeof(vm_page_t) *
+        atop(maxbcachebuf)) * nbuf;
 
-    return(v);
+    return (v);
 }
 
 /* Initialize the buffer subsystem.  Called before use of any buffers. */
@@ -1157,12 +1164,12 @@
     mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
     mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
-    unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
+    unmapped_buf = (caddr_t)kva_alloc(maxphys);
 
     /* finally, initialize each buffer header and stick on empty q */
     for (i = 0; i < nbuf; i++) {
-        bp = &buf[i];
-        bzero(bp, sizeof *bp);
+        bp = nbufp(i);
+        bzero(bp, sizeof(*bp) + sizeof(vm_page_t) * atop(maxbcachebuf));
         bp->b_flags = B_INVAL;
         bp->b_rcred = NOCRED;
         bp->b_wcred = NOCRED;
@@ -1246,7 +1253,8 @@
 
     /* Setup the kva and free list allocators. */
     vmem_set_reclaim(buffer_arena, bufkva_reclaim);
-    buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+    buf_zone = uma_zcache_create("buf free cache",
+        sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf),
         NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
 
     /*
@@ -1295,7 +1303,7 @@
     KASSERT(bp->b_data != unmapped_buf,
         ("mapped buf: b_data was not updated %p", bp));
     KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
-        MAXPHYS, ("b_data + b_offset unmapped %p", bp));
+        maxphys, ("b_data + b_offset unmapped %p", bp));
 }
 
 static inline void
@@ -1330,7 +1338,7 @@
 {
     static int first_buf_printf = 1;
     struct buf *bp;
-    int iter, nbusy, pbusy;
+    int i, iter, nbusy, pbusy;
 #ifndef PREEMPTION
     int subiter;
 #endif
@@ -1348,9 +1356,11 @@
      */
     for (iter = pbusy = 0; iter < 20; iter++) {
         nbusy = 0;
-        for (bp = &buf[nbuf]; --bp >= buf; )
+        for (i = nbuf - 1; i >= 0; i--) {
+            bp = nbufp(i);
             if (isbufbusy(bp))
                 nbusy++;
+        }
         if (nbusy == 0) {
             if (first_buf_printf)
                 printf("All buffers synced.");
@@ -1391,7 +1401,8 @@
      * a fsck if we're just a client of a wedged NFS server
      */
     nbusy = 0;
-    for (bp = &buf[nbuf]; --bp >= buf; ) {
+    for (i = nbuf - 1; i >= 0; i--) {
+        bp = nbufp(i);
         if (isbufbusy(bp)) {
 #if 0
 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
@@ -1571,6 +1582,7 @@
         buf_deallocate(bp);
         bufkva_free(bp);
         atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
+        MPASS((bp->b_flags & B_MAXPHYS) == 0);
         BUF_UNLOCK(bp);
         uma_zfree(buf_zone, bp);
     }
@@ -1674,6 +1686,7 @@
         ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
     KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
     KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+    MPASS((bp->b_flags & B_MAXPHYS) == 0);
 
     bp->b_domain = BD_DOMAIN(bd);
     bp->b_flags = 0;
@@ -2018,6 +2031,9 @@
     KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
         ("Invalid gbflags 0x%x in %s", gbflags, __func__));
+    MPASS((bp->b_flags & B_MAXPHYS) == 0);
+    KASSERT(maxsize <= maxbcachebuf,
+        ("bufkva_alloc kva too large %d %u", maxsize, maxbcachebuf));
 
     bufkva_free(bp);
@@ -3036,6 +3052,10 @@
      */
     obj = bp->b_bufobj->bo_object;
     if (bp->b_npages < desiredpages) {
+        KASSERT(desiredpages <= atop(maxbcachebuf),
+            ("vfs_vmio_extend past maxbcachebuf %p %d %u",
+            bp, desiredpages, maxbcachebuf));
+
         /*
          * We must allocate system pages since blocking
          * here could interfere with paging I/O, no
@@ -3163,7 +3183,7 @@
         (vp->v_mount != 0) && /* Only on nodes that have the size info */
         (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
         size = vp->v_mount->mnt_stat.f_iosize;
-        maxcl = MAXPHYS / size;
+        maxcl = maxphys / size;
 
         BO_RLOCK(bo);
         for (i = 1; i < maxcl; i++)
@@ -4853,6 +4873,10 @@
     to = round_page(to);
     from = round_page(from);
     index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+    MPASS((bp->b_flags & B_MAXPHYS) == 0);
+    KASSERT(to - from <= maxbcachebuf,
+        ("vm_hold_load_pages too large %p %#jx %#jx %u",
+        bp, (uintmax_t)from, (uintmax_t)to, maxbcachebuf));
 
     for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
         /*
@@ -4912,12 +4936,12 @@
     vm_prot_t prot;
     int pidx;
 
+    MPASS((bp->b_flags & B_MAXPHYS) != 0);
     prot = VM_PROT_READ;
     if (bp->b_iocmd == BIO_READ)
         prot |= VM_PROT_WRITE;  /* Less backwards than it looks */
     if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
-        (vm_offset_t)uaddr, len, prot, bp->b_pages,
-        btoc(MAXPHYS))) < 0)
+        (vm_offset_t)uaddr, len, prot, bp->b_pages, btoc(maxphys))) < 0)
         return (-1);
     bp->b_bufsize = len;
     bp->b_npages = pidx;
@@ -5398,19 +5422,23 @@
         db_printf("\n");
         cnt = 0;
         total = 0;
-        for (j = 0; j < nbuf; j++)
-            if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) {
+        for (j = 0; j < nbuf; j++) {
+            bp = nbufp(j);
+            if (bp->b_domain == i && BUF_ISLOCKED(bp)) {
                 cnt++;
-                total += buf[j].b_bufsize;
+                total += bp->b_bufsize;
             }
+        }
         db_printf("\tLocked buffers: %d space %ld\n", cnt, total);
         cnt = 0;
         total = 0;
-        for (j = 0; j < nbuf; j++)
-            if (buf[j].b_domain == i) {
+        for (j = 0; j < nbuf; j++) {
+            bp = nbufp(j);
+            if (bp->b_domain == i) {
                 cnt++;
-                total += buf[j].b_bufsize;
+                total += bp->b_bufsize;
             }
+        }
         db_printf("\tTotal buffers: %d space %ld\n", cnt, total);
     }
 }
@@ -5421,7 +5449,7 @@
     int i;
 
     for (i = 0; i < nbuf; i++) {
-        bp = &buf[i];
+        bp = nbufp(i);
         if (BUF_ISLOCKED(bp)) {
             db_show_buffer((uintptr_t)bp, 1, 0, NULL);
             db_printf("\n");
@@ -5464,7 +5492,7 @@
     }
 
     for (i = 0; i < nbuf; i++) {
-        bp = &buf[i];
+        bp = nbufp(i);
         if (bp->b_qindex == QUEUE_EMPTY)
             nfree++;
         else
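Because struct buf now ends in a flexible b_pages[] sized by maxbcachebuf, the static buffer headers above are no longer a plain C array and &buf[i] would use the wrong stride; all indexing goes through nbufp(). A self-contained illustration of the variable-stride idiom (toy types, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    struct hdr {
        int  id;
        void *pages[];          /* flexible array member */
    };

    static char  *pool;
    static size_t stride;       /* sizeof(struct hdr) + npages * sizeof(ptr) */

    static struct hdr *
    nth(size_t i)
    {
        return ((struct hdr *)(pool + stride * i));
    }

    int
    main(void)
    {
        size_t npages = 8, n = 4, i;

        stride = sizeof(struct hdr) + npages * sizeof(void *);
        pool = calloc(n, stride);
        for (i = 0; i < n; i++)
            nth(i)->id = (int)i;
        printf("hdr 3 id=%d stride=%zu\n", nth(3)->id, stride);
        free(pool);
        return (0);
    }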
Index: sys/kern/vfs_cluster.c
===================================================================
--- sys/kern/vfs_cluster.c
+++ sys/kern/vfs_cluster.c
@@ -386,6 +386,7 @@
     bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT);
     if (bp == NULL)
         return tbp;
+    MPASS((bp->b_flags & B_MAXPHYS) != 0);
 
     /*
      * We are synthesizing a buffer out of vm_page_t's, but
@@ -871,6 +872,7 @@
             --len;
             continue;
         }
+        MPASS((bp->b_flags & B_MAXPHYS) != 0);
 
         /*
          * We got a pbuf to make the cluster in.
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -974,8 +974,8 @@
     iosize = vap->va_blocksize;
     if (iosize == 0)
         iosize = BLKDEV_IOSIZE;
-    if (iosize > MAXPHYS)
-        iosize = MAXPHYS;
+    if (iosize > maxphys)
+        iosize = maxphys;
     buf = malloc(iosize, M_TEMP, M_WAITOK);
 
 #ifdef __notyet__
Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -3162,8 +3162,8 @@
     struct sbuf *sb;
     int error, full = 0, valid_len, max_len;
 
-    /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
-    max_len = MAXPHYS - 1;
+    /* Limit initial buffer size to maxphys to avoid DoS from userspace. */
+    max_len = maxphys - 1;
 
     /* Prevent hostile input from being able to crash the system */
     if (ifc->ifc_len <= 0)
Index: sys/sys/aio.h
===================================================================
--- sys/sys/aio.h
+++ sys/sys/aio.h
@@ -140,8 +140,8 @@
     struct {                    /* BIO backend */
         struct bio *bp;         /* (*) BIO pointer */
         struct buf *pbuf;       /* (*) buffer pointer */
-        struct vm_page *pages[btoc(MAXPHYS)+1]; /* (*) */
         int npages;             /* (*) number of pages */
+        struct vm_page **pages; /* (*) */
     };
     struct {                    /* fsync() requests */
         int pending;            /* (a) number of pending I/O */
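With the fixed pages[] array gone from the aio job above, the pointer either borrows the pbuf's b_pages (mapped path) or owns a malloc'ed array (unmapped path), and cleanup must free only in the owned case, exactly as the vfs_aio.c hunks do. The pattern in miniature (toy code, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Toy model: a request either borrows its page array from a
     * preallocated buffer or owns a malloc'ed one.
     */
    struct req {
        void **pages;
        int  owned;
    };

    static void
    req_cleanup(struct req *r)
    {
        if (r->owned)
            free(r->pages); /* unmapped path: we allocated it */
        /* mapped path: the array belongs to the pbuf and is recycled with it */
        r->pages = NULL;
    }

    int
    main(void)
    {
        static void *pbuf_pages[4];     /* stand-in for pbuf->b_pages */
        struct req a = { pbuf_pages, 0 };
        struct req b = { malloc(4 * sizeof(void *)), 1 };

        req_cleanup(&a);
        req_cleanup(&b);
        printf("cleaned up\n");
        return (0);
    }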
Index: sys/sys/buf.h
===================================================================
--- sys/sys/buf.h
+++ sys/sys/buf.h
@@ -141,7 +141,6 @@
     TAILQ_HEAD(cluster_list_head, buf) cluster_head;
     TAILQ_ENTRY(buf) cluster_entry;
     } b_cluster;
-    struct vm_page *b_pages[btoc(MAXPHYS)];
     int b_npages;
     struct workhead b_dep;      /* (D) List of filesystem dependencies. */
     void *b_fsprivate1;
@@ -156,6 +155,7 @@
 #elif defined(BUF_TRACKING)
     const char *b_io_tracking;
 #endif
+    struct vm_page *b_pages[];
 };
 
 #define b_object b_bufobj->bo_object
@@ -234,7 +234,7 @@
 #define B_INVALONERR    0x00040000  /* Invalidate on write error. */
 #define B_00080000      0x00080000  /* Available flag. */
 #define B_00100000      0x00100000  /* Available flag. */
-#define B_00200000      0x00200000  /* Available flag. */
+#define B_MAXPHYS       0x00200000  /* nitems(b_pages[]) = atop(maxphys). */
 #define B_RELBUF        0x00400000  /* Release VMIO buffer. */
 #define B_FS_FLAG1      0x00800000  /* Available flag for FS use. */
 #define B_NOCOPY        0x01000000  /* Don't copy-on-write this buf. */
@@ -247,7 +247,7 @@
 #define B_REMFREE       0x80000000  /* Delayed bremfree */
 
 #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
-    "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \
+    "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26maxphys\25b20" \
     "\24b19\23invalonerr\22clusterok\21malloc\20nocache\17b14\16inval" \
     "\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \
     "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
@@ -496,8 +496,8 @@
 #ifdef _KERNEL
 extern int  nbuf;           /* The number of buffer headers */
-extern long maxswzone;      /* Max KVA for swap structures */
-extern long maxbcache;      /* Max KVA for buffer cache */
+extern u_long maxswzone;    /* Max KVA for swap structures */
+extern u_long maxbcache;    /* Max KVA for buffer cache */
 extern int  maxbcachebuf;   /* Max buffer cache block size */
 extern long runningbufspace;
 extern long hibufspace;
Index: sys/sys/param.h
===================================================================
--- sys/sys/param.h
+++ sys/sys/param.h
@@ -160,7 +160,7 @@
 #define DFLTPHYS    (64 * 1024)     /* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
-#define MAXPHYS     (128 * 1024)    /* max raw I/O transfer size */
+#define MAXPHYS     (1024 * 1024)   /* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS  (DFLTPHYS/PAGE_SIZE)
Index: sys/sys/systm.h
===================================================================
--- sys/sys/systm.h
+++ sys/sys/systm.h
@@ -74,6 +74,8 @@
 extern int ngroups_max;     /* max # of supplemental groups */
 extern int vm_guest;        /* Running as virtual machine guest? */
 
+extern u_long maxphys;
+
 /*
  * Detected virtual machine guest types. The intention is to expand
  * and/or add to the VM_GUEST_VM type if specific VM functionality is
Index: sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- sys/ufs/ffs/ffs_vfsops.c
+++ sys/ufs/ffs/ffs_vfsops.c
@@ -1055,8 +1055,8 @@
     BO_UNLOCK(&odevvp->v_bufobj);
     if (dev->si_iosize_max != 0)
         mp->mnt_iosize_max = dev->si_iosize_max;
-    if (mp->mnt_iosize_max > MAXPHYS)
-        mp->mnt_iosize_max = MAXPHYS;
+    if (mp->mnt_iosize_max > maxphys)
+        mp->mnt_iosize_max = maxphys;
     if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
         error = EINVAL;
         vfs_mount_error(mp,
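The swap_pager.c hunk below turns "bp->b_flags = B_ASYNC" into "|=" because pbuf_ctor now seeds b_flags with B_MAXPHYS, and a plain assignment would erase that marker. The general shape of the bug, sketched with toy flags:

    #include <assert.h>

    #define F_MAXPHYS 0x1   /* set by the constructor */
    #define F_ASYNC   0x2

    struct tbuf { unsigned flags; };

    static void
    ctor(struct tbuf *b)
    {
        b->flags = F_MAXPHYS;
    }

    int
    main(void)
    {
        struct tbuf b;

        ctor(&b);
        b.flags |= F_ASYNC;     /* right: the ctor's marker survives */
        assert(b.flags & F_MAXPHYS);
        /* "b.flags = F_ASYNC" instead would clear F_MAXPHYS. */
        return (0);
    }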
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -586,7 +586,7 @@
      * but it isn't very efficient).
      *
      * The nsw_cluster_max is constrained by the bp->b_pages[]
-     * array, which has MAXPHYS / PAGE_SIZE entries, and our locally
+     * array, which has maxphys / PAGE_SIZE entries, and our locally
      * defined MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
      * constrained by the swap device interleave stripe size.
      *
@@ -601,7 +601,7 @@
      * have one NFS swap device due to the command/ack latency over NFS.
      * So it all works out pretty well.
      */
-    nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
+    nsw_cluster_max = min(maxphys / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
 
     nsw_wcount_async = 4;
     nsw_wcount_async_max = nsw_wcount_async;
@@ -1314,6 +1314,7 @@
     VM_OBJECT_WUNLOCK(object);
     bp = uma_zalloc(swrbuf_zone, M_WAITOK);
+    MPASS((bp->b_flags & B_MAXPHYS) != 0);
     /* Pages cannot leave the object while busy. */
     for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
         MPASS(p->pindex == bm->pindex + i);
@@ -1522,8 +1523,9 @@
     VM_OBJECT_WUNLOCK(object);
     bp = uma_zalloc(swwbuf_zone, M_WAITOK);
+    MPASS((bp->b_flags & B_MAXPHYS) != 0);
     if (async)
-        bp->b_flags = B_ASYNC;
+        bp->b_flags |= B_ASYNC;
     bp->b_flags |= B_PAGING;
     bp->b_iocmd = BIO_WRITE;
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -115,7 +115,6 @@
 #define PFFOR 4
 
 #define VM_FAULT_READ_DEFAULT   (1 + VM_FAULT_READ_AHEAD_INIT)
-#define VM_FAULT_READ_MAX       (1 + VM_FAULT_READ_AHEAD_MAX)
 
 #define VM_FAULT_DONTNEED_MIN   1048576
Index: sys/vm/vm_init.c
===================================================================
--- sys/vm/vm_init.c
+++ sys/vm/vm_init.c
@@ -212,7 +212,7 @@
     /*
     * Allocate the clean map to hold all of I/O virtual memory.
     */
-    size = (long)nbuf * BKVASIZE + (long)bio_transient_maxcnt * MAXPHYS;
+    size = (long)nbuf * BKVASIZE + (long)bio_transient_maxcnt * maxphys;
     kmi->clean_sva = firstaddr = kva_alloc(size);
     kmi->clean_eva = firstaddr + size;
 
@@ -233,7 +233,7 @@
      * And optionally transient bio space.
      */
     if (bio_transient_maxcnt != 0) {
-        size = (long)bio_transient_maxcnt * MAXPHYS;
+        size = (long)bio_transient_maxcnt * maxphys;
         vmem_init(transient_arena, "transient arena", firstaddr,
             size, PAGE_SIZE, 0, 0);
         firstaddr += size;
Index: sys/vm/vm_map.h
===================================================================
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -396,7 +396,7 @@
  */
 #define VM_FAULT_READ_AHEAD_MIN     7
 #define VM_FAULT_READ_AHEAD_INIT    15
-#define VM_FAULT_READ_AHEAD_MAX     min(atop(MAXPHYS) - 1, UINT8_MAX)
+#define VM_FAULT_READ_AHEAD_MAX     min(atop(maxphys) - 1, UINT8_MAX)
 
 /*
  * The following "find_space" options are supported by vm_map_find().
Index: sys/vm/vm_pager.c
===================================================================
--- sys/vm/vm_pager.c
+++ sys/vm/vm_pager.c
@@ -183,7 +183,8 @@
 {
 
     /* Main zone for paging bufs. */
-    pbuf_zone = uma_zcreate("pbuf", sizeof(struct buf),
+    pbuf_zone = uma_zcreate("pbuf",
+        sizeof(struct buf) + atop(maxphys) * sizeof(vm_page_t),
         pbuf_ctor, pbuf_dtor, pbuf_init, NULL, UMA_ALIGN_CACHE,
         UMA_ZONE_NOFREE);
     /* Few systems may still use this zone directly, so it needs a limit. */
@@ -384,7 +385,7 @@
     bp->b_qindex = 0;   /* On no queue (QUEUE_NONE) */
     bp->b_data = bp->b_kvabase;
     bp->b_xflags = 0;
-    bp->b_flags = 0;
+    bp->b_flags = B_MAXPHYS;
     bp->b_ioflags = 0;
     bp->b_iodone = NULL;
     bp->b_error = 0;
@@ -415,10 +416,10 @@
 {
     struct buf *bp = mem;
 
-    bp->b_kvabase = (void *)kva_alloc(MAXPHYS);
+    bp->b_kvabase = (void *)kva_alloc(maxphys);
     if (bp->b_kvabase == NULL)
         return (ENOMEM);
-    bp->b_kvasize = MAXPHYS;
+    bp->b_kvasize = maxphys;
     BUF_LOCKINIT(bp);
     LIST_INIT(&bp->b_dep);
     bp->b_rcred = bp->b_wcred = NOCRED;
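The vnode_pager.c hunks below replace nitems(bp->b_pages) with atop(maxphys): nitems() is sizeof-based and cannot be applied to a flexible array member at all, so the bound has to come from the runtime limit. Demonstrated with toy structs:

    #include <stdio.h>

    #define nitems(x) (sizeof((x)) / sizeof((x)[0]))

    struct fixed { int n; void *pages[32]; };
    struct flex  { int n; void *pages[]; };

    int
    main(void)
    {
        struct fixed a;

        printf("fixed: %zu entries\n", nitems(a.pages));    /* 32 */
        /*
         * nitems() on struct flex's pages[] would not even compile:
         * sizeof is invalid on an incomplete array type, so the bound
         * must be tracked separately (atop(maxphys) in the kernel).
         */
        return (0);
    }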
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -817,7 +817,7 @@
     KASSERT(foff < object->un_pager.vnp.vnp_size,
         ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
-    KASSERT(count <= nitems(bp->b_pages),
+    KASSERT(count <= atop(maxphys),
         ("%s: requested %d pages", __func__, count));
 
     /*
@@ -832,6 +832,7 @@
     }
 
     bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
+    MPASS((bp->b_flags & B_MAXPHYS) != 0);
 
     /*
      * Get the underlying device blocks for the file with VOP_BMAP().
@@ -916,10 +917,10 @@
      * Check that total amount of pages fit into buf.  Trim rbehind and
      * rahead evenly if not.
      */
-    if (rbehind + rahead + count > nitems(bp->b_pages)) {
+    if (rbehind + rahead + count > atop(maxphys)) {
         int trim, sum;
 
-        trim = rbehind + rahead + count - nitems(bp->b_pages) + 1;
+        trim = rbehind + rahead + count - atop(maxphys) + 1;
         sum = rbehind + rahead;
         if (rbehind == before) {
             /* Roundup rbehind trim to block size. */
@@ -930,9 +931,9 @@
             rbehind -= trim * rbehind / sum;
             rahead -= trim * rahead / sum;
         }
-        KASSERT(rbehind + rahead + count <= nitems(bp->b_pages),
-            ("%s: behind %d ahead %d count %d", __func__,
-            rbehind, rahead, count));
+        KASSERT(rbehind + rahead + count <= atop(maxphys),
+            ("%s: behind %d ahead %d count %d maxphys %lu", __func__,
+            rbehind, rahead, count, maxphys));
 
     /*
      * Fill in the bp->b_pages[] array with requested and optional
@@ -1014,7 +1015,7 @@
     *a_rahead = bp->b_pgafter;
 
 #ifdef INVARIANTS
-    KASSERT(bp->b_npages <= nitems(bp->b_pages),
+    KASSERT(bp->b_npages <= atop(maxphys),
         ("%s: buf %p overflowed", __func__, bp));
     for (int j = 1, prev = 0; j < bp->b_npages; j++) {
         if (bp->b_pages[j] == bogus_page)
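For reference, a runnable sketch of the trim logic above: rbehind and rahead are shaved proportionally until the request fits in atop(maxphys) pages (proportional split as in the hunk; the block-size rounding of rbehind is omitted):

    #include <stdio.h>

    int
    main(void)
    {
        int rbehind = 300, rahead = 100, count = 16, maxpages = 256;
        int trim, sum;

        if (rbehind + rahead + count > maxpages) {
            trim = rbehind + rahead + count - maxpages + 1;
            sum = rbehind + rahead;
            rbehind -= trim * rbehind / sum;    /* 300 -> 180 */
            rahead -= trim * rahead / sum;      /* 100 -> 60 */
        }
        printf("rbehind=%d rahead=%d total=%d (<= %d)\n",
            rbehind, rahead, rbehind + rahead + count, maxpages);
        return (0);
    }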