Index: head/lib/libstand/stand.h =================================================================== --- head/lib/libstand/stand.h (revision 316577) +++ head/lib/libstand/stand.h (revision 316578) @@ -1,422 +1,425 @@ /* * Copyright (c) 1998 Michael Smith. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * From $NetBSD: stand.h,v 1.22 1997/06/26 19:17:40 drochner Exp $ */ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)stand.h 8.1 (Berkeley) 6/11/93 */ #ifndef STAND_H #define STAND_H #include #include #include #include /* this header intentionally exports NULL from */ #include #define CHK(fmt, args...) printf("%s(%d): " fmt "\n", __func__, __LINE__ , ##args) #define PCHK(fmt, args...) 
{printf("%s(%d): " fmt "\n", __func__, __LINE__ , ##args); getchar();} /* Avoid unwanted userlandish components */ #define _KERNEL #include #undef _KERNEL /* special stand error codes */ #define EADAPT (ELAST+1) /* bad adaptor */ #define ECTLR (ELAST+2) /* bad controller */ #define EUNIT (ELAST+3) /* bad unit */ #define ESLICE (ELAST+4) /* bad slice */ #define EPART (ELAST+5) /* bad partition */ #define ERDLAB (ELAST+6) /* can't read disk label */ #define EUNLAB (ELAST+7) /* unlabeled disk */ #define EOFFSET (ELAST+8) /* relative seek not supported */ #define ESALAST (ELAST+8) /* */ struct open_file; /* * This structure is used to define file system operations in a file system * independent way. * * XXX note that filesystem providers should export a pointer to their fs_ops * struct, so that consumers can reference this and thus include the * filesystems that they require. */ struct fs_ops { const char *fs_name; int (*fo_open)(const char *path, struct open_file *f); int (*fo_close)(struct open_file *f); int (*fo_read)(struct open_file *f, void *buf, size_t size, size_t *resid); int (*fo_write)(struct open_file *f, void *buf, size_t size, size_t *resid); off_t (*fo_seek)(struct open_file *f, off_t offset, int where); int (*fo_stat)(struct open_file *f, struct stat *sb); int (*fo_readdir)(struct open_file *f, struct dirent *d); }; /* * libstand-supplied filesystems */ extern struct fs_ops ufs_fsops; extern struct fs_ops tftp_fsops; extern struct fs_ops nfs_fsops; extern struct fs_ops cd9660_fsops; extern struct fs_ops nandfs_fsops; extern struct fs_ops gzipfs_fsops; extern struct fs_ops bzipfs_fsops; extern struct fs_ops dosfs_fsops; extern struct fs_ops ext2fs_fsops; extern struct fs_ops splitfs_fsops; extern struct fs_ops pkgfs_fsops; /* where values for lseek(2) */ #define SEEK_SET 0 /* set file offset to offset */ #define SEEK_CUR 1 /* set file offset to current plus offset */ #define SEEK_END 2 /* set file offset to EOF plus offset */ /* * Device switch */ struct 
devsw { const char dv_name[8]; int dv_type; /* opaque type constant, arch-dependant */ int (*dv_init)(void); /* early probe call */ int (*dv_strategy)(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize); int (*dv_open)(struct open_file *f, ...); int (*dv_close)(struct open_file *f); int (*dv_ioctl)(struct open_file *f, u_long cmd, void *data); int (*dv_print)(int verbose); /* print device information */ void (*dv_cleanup)(void); }; /* * libstand-supplied device switch */ extern struct devsw netdev; extern int errno; /* * Generic device specifier; architecture-dependent * versions may be larger, but should be allowed to * overlap. */ struct devdesc { struct devsw *d_dev; int d_type; #define DEVT_NONE 0 #define DEVT_DISK 1 #define DEVT_NET 2 #define DEVT_CD 3 #define DEVT_ZFS 4 #define DEVT_FD 5 int d_unit; void *d_opendata; }; struct open_file { int f_flags; /* see F_* below */ struct devsw *f_dev; /* pointer to device operations */ void *f_devdata; /* device specific data */ struct fs_ops *f_ops; /* pointer to file system operations */ void *f_fsdata; /* file system specific data */ off_t f_offset; /* current file offset */ char *f_rabuf; /* readahead buffer pointer */ size_t f_ralen; /* valid data in readahead buffer */ off_t f_raoffset; /* consumer offset in readahead buffer */ #define SOPEN_RASIZE 512 }; #define SOPEN_MAX 64 extern struct open_file files[]; /* f_flags values */ #define F_READ 0x0001 /* file opened for reading */ #define F_WRITE 0x0002 /* file opened for writing */ #define F_RAW 0x0004 /* raw device open - no file system */ #define F_NODEV 0x0008 /* network open - no device */ +#define F_MASK 0xFFFF +/* Mode modifier for strategy() */ +#define F_NORA (0x01 << 16) /* Disable Read-Ahead */ #define isascii(c) (((c) & ~0x7F) == 0) static __inline int isupper(int c) { return c >= 'A' && c <= 'Z'; } static __inline int islower(int c) { return c >= 'a' && c <= 'z'; } static __inline int isspace(int c) { return c == ' ' || (c >= 
0x9 && c <= 0xd); } static __inline int isdigit(int c) { return c >= '0' && c <= '9'; } static __inline int isxdigit(int c) { return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } static __inline int isalpha(int c) { return isupper(c) || islower(c); } static __inline int isalnum(int c) { return isalpha(c) || isdigit(c); } static __inline int toupper(int c) { return islower(c) ? c - 'a' + 'A' : c; } static __inline int tolower(int c) { return isupper(c) ? c - 'A' + 'a' : c; } /* sbrk emulation */ extern void setheap(void *base, void *top); extern char *sbrk(int incr); /* Matt Dillon's zalloc/zmalloc */ extern void *malloc(size_t bytes); extern void free(void *ptr); /*#define free(p) {CHK("free %p", p); free(p);} */ /* use for catching guard violations */ extern void *calloc(size_t n1, size_t n2); extern void *realloc(void *ptr, size_t size); extern void *reallocf(void *ptr, size_t size); extern void mallocstats(void); extern int printf(const char *fmt, ...) __printflike(1, 2); extern void vprintf(const char *fmt, __va_list); extern int sprintf(char *buf, const char *cfmt, ...) __printflike(2, 3); extern int snprintf(char *buf, size_t size, const char *cfmt, ...) 
__printflike(3, 4); extern void vsprintf(char *buf, const char *cfmt, __va_list); extern void twiddle(u_int callerdiv); extern void twiddle_divisor(u_int globaldiv); extern void ngets(char *, int); #define gets(x) ngets((x), 0) extern int fgetstr(char *buf, int size, int fd); extern int open(const char *, int); #define O_RDONLY 0x0 #define O_WRONLY 0x1 #define O_RDWR 0x2 extern int close(int); extern void closeall(void); extern ssize_t read(int, void *, size_t); extern ssize_t write(int, void *, size_t); extern struct dirent *readdirfd(int); extern void srandom(u_long seed); extern u_long random(void); /* imports from stdlib, locally modified */ extern long strtol(const char *, char **, int); extern unsigned long strtoul(const char *, char **, int); extern char *optarg; /* getopt(3) external variables */ extern int optind, opterr, optopt, optreset; extern int getopt(int, char * const [], const char *); /* pager.c */ extern void pager_open(void); extern void pager_close(void); extern int pager_output(const char *lines); extern int pager_file(const char *fname); /* No signal state to preserve */ #define setjmp _setjmp #define longjmp _longjmp /* environment.c */ #define EV_DYNAMIC (1<<0) /* value was dynamically allocated, free if changed/unset */ #define EV_VOLATILE (1<<1) /* value is volatile, make a copy of it */ #define EV_NOHOOK (1<<2) /* don't call hook when setting */ struct env_var; typedef char *(ev_format_t)(struct env_var *ev); typedef int (ev_sethook_t)(struct env_var *ev, int flags, const void *value); typedef int (ev_unsethook_t)(struct env_var *ev); struct env_var { char *ev_name; int ev_flags; void *ev_value; ev_sethook_t *ev_sethook; ev_unsethook_t *ev_unsethook; struct env_var *ev_next, *ev_prev; }; extern struct env_var *environ; extern struct env_var *env_getenv(const char *name); extern int env_setenv(const char *name, int flags, const void *value, ev_sethook_t sethook, ev_unsethook_t unsethook); extern char *getenv(const char *name); extern int 
setenv(const char *name, const char *value, int overwrite); extern int putenv(const char *string); extern int unsetenv(const char *name); extern ev_sethook_t env_noset; /* refuse set operation */ extern ev_unsethook_t env_nounset; /* refuse unset operation */ /* BCD conversions (undocumented) */ extern u_char const bcd2bin_data[]; extern u_char const bin2bcd_data[]; extern char const hex2ascii_data[]; #define bcd2bin(bcd) (bcd2bin_data[bcd]) #define bin2bcd(bin) (bin2bcd_data[bin]) #define hex2ascii(hex) (hex2ascii_data[hex]) /* min/max (undocumented) */ static __inline int imax(int a, int b) { return (a > b ? a : b); } static __inline int imin(int a, int b) { return (a < b ? a : b); } static __inline long lmax(long a, long b) { return (a > b ? a : b); } static __inline long lmin(long a, long b) { return (a < b ? a : b); } static __inline u_int max(u_int a, u_int b) { return (a > b ? a : b); } static __inline u_int min(u_int a, u_int b) { return (a < b ? a : b); } static __inline quad_t qmax(quad_t a, quad_t b) { return (a > b ? a : b); } static __inline quad_t qmin(quad_t a, quad_t b) { return (a < b ? a : b); } static __inline u_long ulmax(u_long a, u_long b) { return (a > b ? a : b); } static __inline u_long ulmin(u_long a, u_long b) { return (a < b ? 
a : b); } /* null functions for device/filesystem switches (undocumented) */ extern int nodev(void); extern int noioctl(struct open_file *, u_long, void *); extern void nullsys(void); extern int null_open(const char *path, struct open_file *f); extern int null_close(struct open_file *f); extern int null_read(struct open_file *f, void *buf, size_t size, size_t *resid); extern int null_write(struct open_file *f, void *buf, size_t size, size_t *resid); extern off_t null_seek(struct open_file *f, off_t offset, int where); extern int null_stat(struct open_file *f, struct stat *sb); extern int null_readdir(struct open_file *f, struct dirent *d); /* * Machine dependent functions and data, must be provided or stubbed by * the consumer */ extern int getchar(void); extern int ischar(void); extern void putchar(int); extern int devopen(struct open_file *, const char *, const char **); extern int devclose(struct open_file *f); extern void panic(const char *, ...) __dead2 __printflike(1, 2); extern struct fs_ops *file_system[]; extern struct fs_ops *exclusive_file_system; extern struct devsw *devsw[]; /* * Expose byteorder(3) functions. 
*/ #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED extern uint32_t htonl(uint32_t); extern uint16_t htons(uint16_t); extern uint32_t ntohl(uint32_t); extern uint16_t ntohs(uint16_t); #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif void *Malloc(size_t, const char *, int); void *Calloc(size_t, size_t, const char *, int); void *Realloc(void *, size_t, const char *, int); void Free(void *, const char *, int); #if 1 #define malloc(x) Malloc(x, __FILE__, __LINE__) #define calloc(x, y) Calloc(x, y, __FILE__, __LINE__) #define free(x) Free(x, __FILE__, __LINE__) #define realloc(x, y) Realloc(x, y, __FILE__, __LINE__) #else #define malloc(x) Malloc(x, NULL, 0) #define calloc(x, y) Calloc(x, y, NULL, 0) #define free(x) Free(x, NULL, 0) #define realloc(x, y) Realloc(x, y, NULL, 0) #endif #endif /* STAND_H */ Index: head/sys/boot/common/bcache.c =================================================================== --- head/sys/boot/common/bcache.c (revision 316577) +++ head/sys/boot/common/bcache.c (revision 316578) @@ -1,497 +1,503 @@ /*- * Copyright (c) 1998 Michael Smith * Copyright 2015 Toomas Soome * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include __FBSDID("$FreeBSD$"); /* * Simple hashed block cache */ #include #include #include #include #include "bootstrap.h" /* #define BCACHE_DEBUG */ #ifdef BCACHE_DEBUG # define DEBUG(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args) #else # define DEBUG(fmt, args...) #endif struct bcachectl { daddr_t bc_blkno; int bc_count; }; /* * bcache per device node. cache is allocated on device first open and freed * on last close, to save memory. The issue there is the size; biosdisk * supports up to 31 (0x1f) devices. Classic setup would use single disk * to boot from, but this has changed with zfs. 
*/
struct bcache {
    struct bcachectl    *bcache_ctl;    /* one control record per cache slot */
    caddr_t             bcache_data;    /* slot storage, bcache_blksize bytes per slot */
    size_t              bcache_nblks;   /* number of slots; BHASH() masks with (nblks - 1) */
    size_t              ra;             /* current read-ahead hint, in blocks */
};

static u_int bcache_total_nblks;        /* set by bcache_init */
static u_int bcache_blksize;            /* set by bcache_init */
static u_int bcache_numdev;             /* set by bcache_add_dev */
/* statistics */
static u_int bcache_units;      /* number of devices with cache */
static u_int bcache_unit_nblks; /* nblocks per unit */
static u_int bcache_hits;       /* requested blocks found in the cache */
static u_int bcache_misses;     /* requested blocks not found in the cache */
static u_int bcache_ops;        /* calls to bcache_strategy() */
static u_int bcache_bypasses;   /* requests passed straight to the device */
static u_int bcache_bcount;     /* running counter used to stamp inserted blocks */
static u_int bcache_rablks;     /* blocks accounted to read-ahead */

/*
 * Map a block number to its slot index. This is a plain bit mask, so it
 * only distributes correctly when bcache_nblks is a power of 2.
 */
#define BHASH(bc, blkno)        ((blkno) & ((bc)->bcache_nblks - 1))
/*
 * NOTE: despite the name, this evaluates to non-zero when the block is
 * NOT held in its slot (a miss) and to zero on a hit.
 */
#define BCACHE_LOOKUP(bc, blkno)        \
        ((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))
#define BCACHE_READAHEAD        256     /* initial and maximum read-ahead hint, in blocks */
#define BCACHE_MINREADAHEAD     32      /* the hint is not reduced below this, in blocks */
#define BCACHE_MARKER           0xdeadbeef      /* guard word stored just past the slot storage */

static void     bcache_invalidate(struct bcache *bc, daddr_t blkno);
static void     bcache_insert(struct bcache *bc, daddr_t blkno);
static void     bcache_free_instance(struct bcache *bc);

/*
 * Initialise the cache for (nblks) of (bsize).
 *
 * Only records the totals; no memory is allocated here. The per-device
 * share of nblks is computed later by shifting it right, so nblks is
 * presumably expected to be a power of 2 -- TODO confirm with callers.
 * Both values are stored in u_int variables.
 */
void
bcache_init(size_t nblks, size_t bsize)
{
    /* set up control data */
    bcache_total_nblks = nblks;
    bcache_blksize = bsize;
}

/*
 * Add a number of devices to bcache. We have to divide the cache space
 * between the devices, so bcache_add_dev() can be used to set up the
 * number. The issue is, we need to get the number before actual allocations.
 * bcache_add_dev() is supposed to be called from device init() call, so the
 * assumption is, devsw dv_init is called for plain devices first, and
 * for zfs, last.
*/ void bcache_add_dev(int devices) { bcache_numdev += devices; } void * bcache_allocate(void) { u_int i; struct bcache *bc = malloc(sizeof (struct bcache)); int disks = bcache_numdev; uint32_t *marker; if (disks == 0) disks = 1; /* safe guard */ if (bc == NULL) { errno = ENOMEM; return (bc); } /* * the bcache block count must be power of 2 for hash function */ i = fls(disks) - 1; /* highbit - 1 */ if (disks > (1 << i)) /* next power of 2 */ i++; bc->bcache_nblks = bcache_total_nblks >> i; bcache_unit_nblks = bc->bcache_nblks; bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + sizeof(uint32_t)); if (bc->bcache_data == NULL) { /* dont error out yet. fall back to 32 blocks and try again */ bc->bcache_nblks = 32; bc->bcache_data = malloc(bc->bcache_nblks * bcache_blksize + sizeof(uint32_t)); } bc->bcache_ctl = malloc(bc->bcache_nblks * sizeof(struct bcachectl)); if ((bc->bcache_data == NULL) || (bc->bcache_ctl == NULL)) { bcache_free_instance(bc); errno = ENOMEM; return (NULL); } /* Insert cache end marker. */ marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); *marker = BCACHE_MARKER; /* Flush the cache */ for (i = 0; i < bc->bcache_nblks; i++) { bc->bcache_ctl[i].bc_count = -1; bc->bcache_ctl[i].bc_blkno = -1; } bcache_units++; bc->ra = BCACHE_READAHEAD; /* optimistic read ahead */ return (bc); } void bcache_free(void *cache) { struct bcache *bc = cache; if (bc == NULL) return; bcache_free_instance(bc); bcache_units--; } /* * Handle a write request; write directly to the disk, and populate the * cache with the new values. 
*/ static int write_strategy(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { struct bcache_devdata *dd = (struct bcache_devdata *)devdata; struct bcache *bc = dd->dv_cache; daddr_t i, nblk; nblk = size / bcache_blksize; /* Invalidate the blocks being written */ for (i = 0; i < nblk; i++) { bcache_invalidate(bc, blk + i); } /* Write the blocks */ return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); } /* * Handle a read request; fill in parts of the request that can * be satisfied by the cache, use the supplied strategy routine to do * device I/O and then use the I/O results to populate the cache. */ static int read_strategy(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { struct bcache_devdata *dd = (struct bcache_devdata *)devdata; struct bcache *bc = dd->dv_cache; size_t i, nblk, p_size, r_size, complete, ra; int result; daddr_t p_blk; caddr_t p_buf; uint32_t *marker; if (bc == NULL) { errno = ENODEV; return (-1); } marker = (uint32_t *)(bc->bcache_data + bc->bcache_nblks * bcache_blksize); if (rsize != NULL) *rsize = 0; nblk = size / bcache_blksize; if (nblk == 0 && size != 0) nblk++; result = 0; complete = 1; /* Satisfy any cache hits up front, break on first miss */ for (i = 0; i < nblk; i++) { if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) { bcache_misses += (nblk - i); complete = 0; if (nblk - i > BCACHE_MINREADAHEAD && bc->ra > BCACHE_MINREADAHEAD) bc->ra >>= 1; /* reduce read ahead */ break; } else { bcache_hits++; } } if (complete) { /* whole set was in cache, return it */ if (bc->ra < BCACHE_READAHEAD) bc->ra <<= 1; /* increase read ahead */ bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); goto done; } /* * Fill in any misses. From check we have i pointing to first missing * block, read in all remaining blocks + readahead. * We have space at least for nblk - i before bcache wraps. 
*/ p_blk = blk + i; p_buf = bc->bcache_data + (bcache_blksize * BHASH(bc, p_blk)); r_size = bc->bcache_nblks - BHASH(bc, p_blk); /* remaining blocks */ p_size = MIN(r_size, nblk - i); /* read at least those blocks */ /* * The read ahead size setup. * While the read ahead can save us IO, it also can complicate things: * 1. We do not want to read ahead by wrapping around the * bcache end - this would complicate the cache management. * 2. We are using bc->ra as dynamic hint for read ahead size, * detected cache hits will increase the read-ahead block count, and * misses will decrease, see the code above. * 3. The bcache is sized by 512B blocks, however, the underlying device * may have a larger sector size, and we should perform the IO by * taking into account these larger sector sizes. We could solve this by * passing the sector size to bcache_allocate(), or by using ioctl(), but * in this version we are using the constant, 16 blocks, and are rounding * read ahead block count down to multiple of 16. * Using the constant has two reasons, we are not entirely sure if the * BIOS disk interface is providing the correct value for sector size. * And secondly, this way we get the most conservative setup for the ra. * * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however, * we want to cover CDs (2K) and 4K disks. * bcache_allocate() will always fall back to a minimum of 32 blocks. * Our choice of 16 read ahead blocks will always fit inside the bcache. */ - ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size); + if ((rw & F_NORA) == F_NORA) + ra = 0; + else + ra = bc->bcache_nblks - BHASH(bc, p_blk + p_size); + if (ra != 0 && ra != bc->bcache_nblks) { /* do we have RA space? 
*/ ra = MIN(bc->ra, ra - 1); ra = rounddown(ra, 16); /* multiple of 16 blocks */ p_size += ra; } /* invalidate bcache */ for (i = 0; i < p_size; i++) { bcache_invalidate(bc, p_blk + i); } r_size = 0; /* * with read-ahead, it may happen we are attempting to read past * disk end, as bcache has no information about disk size. * in such case we should get partial read if some blocks can be * read or error, if no blocks can be read. * in either case we should return the data in bcache and only * return error if there is no data. */ + rw &= F_MASK; result = dd->dv_strategy(dd->dv_devdata, rw, p_blk, p_size * bcache_blksize, p_buf, &r_size); r_size /= bcache_blksize; for (i = 0; i < r_size; i++) bcache_insert(bc, p_blk + i); /* update ra statistics */ if (r_size != 0) { if (r_size < p_size) bcache_rablks += (p_size - r_size); else bcache_rablks += ra; } /* check how much data can we copy */ for (i = 0; i < nblk; i++) { if (BCACHE_LOOKUP(bc, (daddr_t)(blk + i))) break; } if (size > i * bcache_blksize) size = i * bcache_blksize; if (size != 0) { bcopy(bc->bcache_data + (bcache_blksize * BHASH(bc, blk)), buf, size); result = 0; } if (*marker != BCACHE_MARKER) { printf("BUG: bcache corruption detected: nblks: %zu p_blk: %lu, " "p_size: %zu, ra: %zu\n", bc->bcache_nblks, (long unsigned)BHASH(bc, p_blk), p_size, ra); } done: if ((result == 0) && (rsize != NULL)) *rsize = size; return(result); } /* * Requests larger than 1/2 cache size will be bypassed and go * directly to the disk. XXX tune this. 
*/ int bcache_strategy(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { struct bcache_devdata *dd = (struct bcache_devdata *)devdata; struct bcache *bc = dd->dv_cache; u_int bcache_nblks = 0; int nblk, cblk, ret; size_t csize, isize, total; bcache_ops++; if (bc != NULL) bcache_nblks = bc->bcache_nblks; /* bypass large requests, or when the cache is inactive */ if (bc == NULL || ((size * 2 / bcache_blksize) > bcache_nblks)) { DEBUG("bypass %zu from %qu", size / bcache_blksize, blk); bcache_bypasses++; + rw &= F_MASK; return (dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); } - switch (rw) { + switch (rw & F_MASK) { case F_READ: nblk = size / bcache_blksize; if (size != 0 && nblk == 0) nblk++; /* read at least one block */ ret = 0; total = 0; while(size) { cblk = bcache_nblks - BHASH(bc, blk); /* # of blocks left */ cblk = MIN(cblk, nblk); if (size <= bcache_blksize) csize = size; else csize = cblk * bcache_blksize; ret = read_strategy(devdata, rw, blk, csize, buf+total, &isize); /* * we may have error from read ahead, if we have read some data * return partial read. */ if (ret != 0 || isize == 0) { if (total != 0) ret = 0; break; } blk += isize / bcache_blksize; total += isize; size -= isize; nblk = size / bcache_blksize; } if (rsize) *rsize = total; return (ret); case F_WRITE: - return write_strategy(devdata, rw, blk, size, buf, rsize); + return write_strategy(devdata, F_WRITE, blk, size, buf, rsize); } return -1; } /* * Free allocated bcache instance */ static void bcache_free_instance(struct bcache *bc) { if (bc != NULL) { if (bc->bcache_ctl) free(bc->bcache_ctl); if (bc->bcache_data) free(bc->bcache_data); free(bc); } } /* * Insert a block into the cache. 
*/ static void bcache_insert(struct bcache *bc, daddr_t blkno) { u_int cand; cand = BHASH(bc, blkno); DEBUG("insert blk %llu -> %u # %d", blkno, cand, bcache_bcount); bc->bcache_ctl[cand].bc_blkno = blkno; bc->bcache_ctl[cand].bc_count = bcache_bcount++; } /* * Invalidate a block from the cache. */ static void bcache_invalidate(struct bcache *bc, daddr_t blkno) { u_int i; i = BHASH(bc, blkno); if (bc->bcache_ctl[i].bc_blkno == blkno) { bc->bcache_ctl[i].bc_count = -1; bc->bcache_ctl[i].bc_blkno = -1; DEBUG("invalidate blk %llu", blkno); } } #ifndef BOOT2 COMMAND_SET(bcachestat, "bcachestat", "get disk block cache stats", command_bcache); static int command_bcache(int argc, char *argv[]) { if (argc != 1) { command_errmsg = "wrong number of arguments"; return(CMD_ERROR); } printf("\ncache blocks: %d\n", bcache_total_nblks); printf("cache blocksz: %d\n", bcache_blksize); printf("cache readahead: %d\n", bcache_rablks); printf("unit cache blocks: %d\n", bcache_unit_nblks); printf("cached units: %d\n", bcache_units); printf("%d ops %d bypasses %d hits %d misses\n", bcache_ops, bcache_bypasses, bcache_hits, bcache_misses); return(CMD_OK); } #endif Index: head/sys/boot/common/disk.c =================================================================== --- head/sys/boot/common/disk.c (revision 316577) +++ head/sys/boot/common/disk.c (revision 316578) @@ -1,421 +1,426 @@ /*- * Copyright (c) 1998 Michael Smith * Copyright (c) 2012 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "disk.h" #ifdef DISK_DEBUG # define DEBUG(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args) #else # define DEBUG(fmt, args...) #endif struct open_disk { struct ptable *table; uint64_t mediasize; uint64_t entrysize; u_int sectorsize; }; struct print_args { struct disk_devdesc *dev; const char *prefix; int verbose; }; /* Convert size to a human-readable number. */ static char * display_size(uint64_t size, u_int sectorsize) { static char buf[80]; char unit; size = size * sectorsize / 1024; unit = 'K'; if (size >= 10485760000LL) { size /= 1073741824; unit = 'T'; } else if (size >= 10240000) { size /= 1048576; unit = 'G'; } else if (size >= 10000) { size /= 1024; unit = 'M'; } sprintf(buf, "%ld%cB", (long)size, unit); return (buf); } int ptblread(void *d, void *buf, size_t blocks, uint64_t offset) { struct disk_devdesc *dev; struct open_disk *od; dev = (struct disk_devdesc *)d; od = (struct open_disk *)dev->d_opendata; - return (dev->d_dev->dv_strategy(dev, F_READ, offset, + + /* + * As the GPT backup partition is located at the end of the disk, + * to avoid reading past disk end, flag bcache not to use RA. 
+ */ + return (dev->d_dev->dv_strategy(dev, F_READ | F_NORA, offset, blocks * od->sectorsize, (char *)buf, NULL)); } #define PWIDTH 35 static int ptable_print(void *arg, const char *pname, const struct ptable_entry *part) { struct disk_devdesc dev; struct print_args *pa, bsd; struct open_disk *od; struct ptable *table; char line[80]; int res; pa = (struct print_args *)arg; od = (struct open_disk *)pa->dev->d_opendata; sprintf(line, " %s%s: %s", pa->prefix, pname, parttype2str(part->type)); if (pa->verbose) sprintf(line, "%-*s%s", PWIDTH, line, display_size(part->end - part->start + 1, od->sectorsize)); strcat(line, "\n"); if (pager_output(line)) return 1; res = 0; if (part->type == PART_FREEBSD) { /* Open slice with BSD label */ dev.d_dev = pa->dev->d_dev; dev.d_unit = pa->dev->d_unit; dev.d_slice = part->index; dev.d_partition = -1; if (disk_open(&dev, part->end - part->start + 1, od->sectorsize) == 0) { table = ptable_open(&dev, part->end - part->start + 1, od->sectorsize, ptblread); if (table != NULL) { sprintf(line, " %s%s", pa->prefix, pname); bsd.dev = pa->dev; bsd.prefix = line; bsd.verbose = pa->verbose; res = ptable_iterate(table, &bsd, ptable_print); ptable_close(table); } disk_close(&dev); } } return (res); } #undef PWIDTH int disk_print(struct disk_devdesc *dev, char *prefix, int verbose) { struct open_disk *od; struct print_args pa; /* Disk should be opened */ od = (struct open_disk *)dev->d_opendata; pa.dev = dev; pa.prefix = prefix; pa.verbose = verbose; return (ptable_iterate(od->table, &pa, ptable_print)); } int disk_read(struct disk_devdesc *dev, void *buf, uint64_t offset, u_int blocks) { struct open_disk *od; int ret; od = (struct open_disk *)dev->d_opendata; ret = dev->d_dev->dv_strategy(dev, F_READ, dev->d_offset + offset, blocks * od->sectorsize, buf, NULL); return (ret); } int disk_write(struct disk_devdesc *dev, void *buf, uint64_t offset, u_int blocks) { struct open_disk *od; int ret; od = (struct open_disk *)dev->d_opendata; ret = 
dev->d_dev->dv_strategy(dev, F_WRITE, dev->d_offset + offset, blocks * od->sectorsize, buf, NULL); return (ret); } int disk_ioctl(struct disk_devdesc *dev, u_long cmd, void *data) { struct open_disk *od = dev->d_opendata; if (od == NULL) return (ENOTTY); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = od->sectorsize; break; case DIOCGMEDIASIZE: if (dev->d_offset == 0) *(uint64_t *)data = od->mediasize; else *(uint64_t *)data = od->entrysize * od->sectorsize; break; default: return (ENOTTY); } return (0); } int disk_open(struct disk_devdesc *dev, uint64_t mediasize, u_int sectorsize) { struct open_disk *od; struct ptable *table; struct ptable_entry part; int rc, slice, partition; rc = 0; /* * While we are reading disk metadata, make sure we do it relative * to the start of the disk */ dev->d_offset = 0; table = NULL; slice = dev->d_slice; partition = dev->d_partition; od = (struct open_disk *)malloc(sizeof(struct open_disk)); if (od == NULL) { DEBUG("no memory"); return (ENOMEM); } dev->d_opendata = od; od->entrysize = 0; od->mediasize = mediasize; od->sectorsize = sectorsize; DEBUG("%s unit %d, slice %d, partition %d => %p", disk_fmtdev(dev), dev->d_unit, dev->d_slice, dev->d_partition, od); /* Determine disk layout. 
*/ od->table = ptable_open(dev, mediasize / sectorsize, sectorsize, ptblread); if (od->table == NULL) { DEBUG("Can't read partition table"); rc = ENXIO; goto out; } if (ptable_getsize(od->table, &mediasize) != 0) { rc = ENXIO; goto out; } if (mediasize > od->mediasize) { od->mediasize = mediasize; } if (ptable_gettype(od->table) == PTABLE_BSD && partition >= 0) { /* It doesn't matter what value has d_slice */ rc = ptable_getpart(od->table, &part, partition); if (rc == 0) { dev->d_offset = part.start; od->entrysize = part.end - part.start + 1; } } else if (slice >= 0) { /* Try to get information about partition */ if (slice == 0) rc = ptable_getbestpart(od->table, &part); else rc = ptable_getpart(od->table, &part, slice); if (rc != 0) /* Partition doesn't exist */ goto out; dev->d_offset = part.start; od->entrysize = part.end - part.start + 1; slice = part.index; if (ptable_gettype(od->table) == PTABLE_GPT) { partition = 255; goto out; /* Nothing more to do */ } else if (partition == 255) { /* * When we try to open GPT partition, but partition * table isn't GPT, reset d_partition value to -1 * and try to autodetect appropriate value. */ partition = -1; } /* * If d_partition < 0 and we are looking at a BSD slice, * then try to read BSD label, otherwise return the * whole MBR slice. */ if (partition == -1 && part.type != PART_FREEBSD) goto out; /* Try to read BSD label */ table = ptable_open(dev, part.end - part.start + 1, od->sectorsize, ptblread); if (table == NULL) { DEBUG("Can't read BSD label"); rc = ENXIO; goto out; } /* * If slice contains BSD label and d_partition < 0, then * assume the 'a' partition. Otherwise just return the * whole MBR slice, because it can contain ZFS. 
*/ if (partition < 0) { if (ptable_gettype(table) != PTABLE_BSD) goto out; partition = 0; } rc = ptable_getpart(table, &part, partition); if (rc != 0) goto out; dev->d_offset += part.start; od->entrysize = part.end - part.start + 1; } out: if (table != NULL) ptable_close(table); if (rc != 0) { if (od->table != NULL) ptable_close(od->table); free(od); DEBUG("%s could not open", disk_fmtdev(dev)); } else { /* Save the slice and partition number to the dev */ dev->d_slice = slice; dev->d_partition = partition; DEBUG("%s offset %lld => %p", disk_fmtdev(dev), (long long)dev->d_offset, od); } return (rc); } int disk_close(struct disk_devdesc *dev) { struct open_disk *od; od = (struct open_disk *)dev->d_opendata; DEBUG("%s closed => %p", disk_fmtdev(dev), od); ptable_close(od->table); free(od); return (0); } char* disk_fmtdev(struct disk_devdesc *dev) { static char buf[128]; char *cp; cp = buf + sprintf(buf, "%s%d", dev->d_dev->dv_name, dev->d_unit); if (dev->d_slice >= 0) { #ifdef LOADER_GPT_SUPPORT if (dev->d_partition == 255) { sprintf(cp, "p%d:", dev->d_slice); return (buf); } else #endif #ifdef LOADER_MBR_SUPPORT cp += sprintf(cp, "s%d", dev->d_slice); #endif } if (dev->d_partition >= 0) cp += sprintf(cp, "%c", dev->d_partition + 'a'); strcat(cp, ":"); return (buf); } int disk_parsedev(struct disk_devdesc *dev, const char *devspec, const char **path) { int unit, slice, partition; const char *np; char *cp; np = devspec; unit = slice = partition = -1; if (*np != '\0' && *np != ':') { unit = strtol(np, &cp, 10); if (cp == np) return (EUNIT); #ifdef LOADER_GPT_SUPPORT if (*cp == 'p') { np = cp + 1; slice = strtol(np, &cp, 10); if (np == cp) return (ESLICE); /* we don't support nested partitions on GPT */ if (*cp != '\0' && *cp != ':') return (EINVAL); partition = 255; } else #endif #ifdef LOADER_MBR_SUPPORT if (*cp == 's') { np = cp + 1; slice = strtol(np, &cp, 10); if (np == cp) return (ESLICE); } #endif if (*cp != '\0' && *cp != ':') { partition = *cp - 'a'; if 
(partition < 0) return (EPART); cp++; } } else return (EINVAL); if (*cp != '\0' && *cp != ':') return (EINVAL); dev->d_unit = unit; dev->d_slice = slice; dev->d_partition = partition; if (path != NULL) *path = (*cp == '\0') ? cp: cp + 1; return (0); }