Index: head/stand/defs.mk =================================================================== --- head/stand/defs.mk (revision 326447) +++ head/stand/defs.mk (revision 326448) @@ -1,216 +1,218 @@ # $FreeBSD$ .include +WARNS?=0 + .if !defined(__BOOT_DEFS_MK__) __BOOT_DEFS_MK__=${MFILE} BOOTSRC= ${SRCTOP}/stand EFISRC= ${BOOTSRC}/efi EFIINC= ${EFISRC}/include EFIINCMD= ${EFIINC}/${MACHINE} FDTSRC= ${BOOTSRC}/fdt FICLSRC= ${BOOTSRC}/ficl LDRSRC= ${BOOTSRC}/common SASRC= ${BOOTSRC}/libsa SYSDIR= ${SRCTOP}/sys UBOOTSRC= ${BOOTSRC}/uboot ZFSSRC= ${BOOTSRC}/zfs BOOTOBJ= ${OBJTOP}/stand # BINDIR is where we install BINDIR?= /boot # NB: The makefiles depend on these being empty when we don't build forth. .if ${MK_FORTH} != "no" LIBFICL= ${BOOTOBJ}/ficl/libficl.a .if ${MACHINE} == "i386" LIBFICL32= ${LIBFICL} .else LIBFICL32= ${BOOTOBJ}/ficl32/libficl.a .endif .endif LIBSA= ${BOOTOBJ}/libsa/libsa.a .if ${MACHINE} == "i386" LIBSA32= ${LIBSA} .else LIBSA32= ${BOOTOBJ}/libsa32/libsa32.a .endif # Standard options: CFLAGS+= -I${SASRC} -D_STANDALONE CFLAGS+= -I${SYSDIR} # Filesystem support .if ${LOADER_CD9660_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_CD9660_SUPPORT .endif .if ${LOADER_EXT2FS_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_EXT2FS_SUPPORT .endif .if ${LOADER_MSDOS_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_MSDOS_SUPPORT .endif .if ${LOADER_NANDFS_SUPPORT:U${MK_NAND}} == "yes" CFLAGS+= -DLOADER_NANDFS_SUPPORT .endif .if ${LOADER_UFS_SUPPORT:Uyes} == "yes" CFLAGS+= -DLOADER_UFS_SUPPORT .endif # Compression .if ${LOADER_GZIP_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_GZIP_SUPPORT .endif .if ${LOADER_BZIP2_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_BZIP2_SUPPORT .endif # Network related things .if ${LOADER_NET_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_NET_SUPPORT .endif .if ${LOADER_NFS_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_NFS_SUPPORT .endif .if ${LOADER_TFTP_SUPPORT:Uno} == "yes" CFLAGS+= -DLOADER_TFTP_SUPPORT .endif # Disk and partition support .if ${LOADER_DISK_SUPPORT:Uyes} == "yes" CFLAGS+= -DLOADER_DISK_SUPPORT .if ${LOADER_GPT_SUPPORT:Uyes} == "yes" CFLAGS+= -DLOADER_GPT_SUPPORT .endif .if ${LOADER_MBR_SUPPORT:Uyes} == "yes" CFLAGS+= -DLOADER_MBR_SUPPORT .endif # GELI Support, with backward compat hooks .if defined(HAVE_GELI) .if defined(LOADER_NO_GELI_SUPPORT) MK_LOADER_GELI=no .warning "Please move from LOADER_NO_GELI_SUPPORT to WITHOUT_LOADER_GELI" .endif .if defined(LOADER_GELI_SUPPORT) MK_LOADER_GELI=yes .warning "Please move from LOADER_GELI_SUPPORT to WITH_LOADER_GELI" .endif .if ${MK_LOADER_GELI} == "yes" CFLAGS+= -DLOADER_GELI_SUPPORT CFLAGS+= -I${BOOTSRC}/geli LIBGELIBOOT= ${BOOTOBJ}/geli/libgeliboot.a .endif .endif .endif # Machine specific flags for all builds here # All PowerPC builds are 32 bit. We have no 64-bit loaders on powerpc # or powerpc64. .if ${MACHINE_ARCH} == "powerpc64" CFLAGS+= -m32 -mcpu=powerpc .endif # For amd64, there's a bit of mixed bag. Some of the tree (i386, lib*32) is # build 32-bit and some 64-bit (lib*, efi). Centralize all the 32-bit magic here # and activate it when DO32 is explicitly defined to be 1. .if ${MACHINE_ARCH} == "amd64" && ${DO32:U0} == 1 CFLAGS+= -m32 -mcpu=i386 # LD_FLAGS is passed directly to ${LD}, not via ${CC}: LD_FLAGS+= -m elf_i386_fbsd AFLAGS+= --32 .endif SSP_CFLAGS= # Add in the no float / no SIMD stuff and announce we're freestanding # aarch64 and riscv don't have -msoft-float, but all others do. riscv # currently has no /boot/loader, but may soon. CFLAGS+= -ffreestanding ${CFLAGS_NO_SIMD} .if ${MACHINE_CPUARCH} == "aarch64" CFLAGS+= -mgeneral-regs-only .elif ${MACHINE_CPUARCH} != "riscv" CFLAGS+= -msoft-float .endif .if ${MACHINE_CPUARCH} == "i386" || (${MACHINE_CPUARCH} == "amd64" && ${DO32:U0} == 1) CFLAGS+= -march=i386 CFLAGS.gcc+= -mpreferred-stack-boundary=2 .endif .if ${MACHINE_CPUARCH} == "arm" # Do not generate movt/movw, because the relocation fixup for them does not # translate to the -Bsymbolic -pie format required by self_reloc() in loader(8). # Also, the fpu is not available in a standalone environment. .if ${COMPILER_VERSION} < 30800 CFLAGS.clang+= -mllvm -arm-use-movt=0 .else CFLAGS.clang+= -mno-movt .endif CFLAGS.clang+= -mfpu=none .endif # The boot loader build uses dd status=none, where possible, for reproducible # build output (since performance varies from run to run). Trouble is that # option was recently (10.3) added to FreeBSD and is non-standard. Only use it # when this test succeeds rather than require dd to be a bootstrap tool. DD_NOSTATUS!=(dd status=none count=0 2> /dev/null && echo status=none) || true DD=dd ${DD_NOSTATUS} .if ${MK_LOADER_FORCE_LE} != "no" .if ${MACHINE_ARCH} == "powerpc64" CFLAGS+= -mlittle-endian .endif .endif # Make sure we use the machine link we're about to create CFLAGS+=-I. _ILINKS=machine .if ${MACHINE} != ${MACHINE_CPUARCH} && ${MACHINE} != "arm64" _ILINKS+=${MACHINE_CPUARCH} .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _ILINKS+=x86 .endif CLEANFILES+=${_ILINKS} all: ${PROG} beforedepend: ${_ILINKS} beforebuild: ${_ILINKS} # Ensure that the links exist without depending on it when it exists which # causes all the modules to be rebuilt when the directory pointed to changes. .for _link in ${_ILINKS} .if !exists(${.OBJDIR}/${_link}) ${OBJS}: ${_link} .endif .endfor .NOPATH: ${_ILINKS} ${_ILINKS}: @case ${.TARGET} in \ machine) \ if [ ${DO32:U0} -eq 0 ]; then \ path=${SYSDIR}/${MACHINE}/include ; \ else \ path=${SYSDIR}/${MACHINE:C/amd64/i386/}/include ; \ fi ;; \ *) \ path=${SYSDIR}/${.TARGET:T}/include ;; \ esac ; \ path=`(cd $$path && /bin/pwd)` ; \ ${ECHO} ${.TARGET:T} "->" $$path ; \ ln -fhs $$path ${.TARGET:T} # For loader implementations, we generate a loader.help file. This can be suppressed by # setting HELP_FILES to nothing. HELP_FILES= ${LDRSRC}/help.common .endif # __BOOT_DEFS_MK__ Index: head/stand/geli/Makefile =================================================================== --- head/stand/geli/Makefile (revision 326447) +++ head/stand/geli/Makefile (revision 326448) @@ -1,45 +1,43 @@ # $FreeBSD$ # libgeliboot MAN= DO32=1 .include MK_SSP= no LIB= geliboot INTERNALLIB= MK_PROFILE= no NO_PIC= -WARNS?= 0 - # Our password input method SRCS+= pwgets.c # sha256 and sha512 from sys/crypto .PATH: ${SYSDIR}/crypto/sha2 CFLAGS+= -DWEAK_REFS SRCS+= sha256c.c sha512c.c # md5 from libmd .PATH: ${SRCTOP}/lib/libmd SRCS+= md5c.c # AES implementation from sys/crypto .PATH: ${SYSDIR}/crypto/rijndael CFLAGS+= -I${LDRSRC} # Remove asserts CFLAGS+= -DNDEBUG SRCS+= rijndael-alg-fst.c rijndael-api-fst.c rijndael-api.c # local GELI Implementation .PATH: ${SYSDIR}/geom/eli SRCS+= geliboot_crypto.c g_eli_hmac.c g_eli_key.c g_eli_key_cache.c pkcs5v2.c # aes .PATH: ${SYSDIR}/opencrypto SRCS+= xform_aes_xts.c .include .include Index: head/stand/geli/geliboot.c =================================================================== --- head/stand/geli/geliboot.c (revision 326447) +++ head/stand/geli/geliboot.c (revision 326448) @@ -1,437 +1,437 @@ /*- * Copyright (c) 2015 Allan Jude * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "geliboot_internal.h" #include "geliboot.h" SLIST_HEAD(geli_list, geli_entry) geli_head = SLIST_HEAD_INITIALIZER(geli_head); struct geli_list *geli_headp; typedef u_char geli_ukey[G_ELI_USERKEYLEN]; static geli_ukey saved_keys[GELI_MAX_KEYS]; static unsigned int nsaved_keys = 0; /* * Copy keys from local storage to the keybuf struct. * Destroy the local storage when finished. */ void geli_fill_keybuf(struct keybuf *fkeybuf) { unsigned int i; for (i = 0; i < nsaved_keys; i++) { fkeybuf->kb_ents[i].ke_type = KEYBUF_TYPE_GELI; memcpy(fkeybuf->kb_ents[i].ke_data, saved_keys[i], G_ELI_USERKEYLEN); } fkeybuf->kb_nents = nsaved_keys; explicit_bzero(saved_keys, sizeof(saved_keys)); } /* * Copy keys from a keybuf struct into local storage. * Zero out the keybuf. */ void geli_save_keybuf(struct keybuf *skeybuf) { unsigned int i; for (i = 0; i < skeybuf->kb_nents && i < GELI_MAX_KEYS; i++) { memcpy(saved_keys[i], skeybuf->kb_ents[i].ke_data, G_ELI_USERKEYLEN); explicit_bzero(skeybuf->kb_ents[i].ke_data, G_ELI_USERKEYLEN); skeybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE; } nsaved_keys = skeybuf->kb_nents; skeybuf->kb_nents = 0; } static void save_key(geli_ukey key) { /* * If we run out of key space, the worst that will happen is * it will ask the user for the password again. */ if (nsaved_keys < GELI_MAX_KEYS) { memcpy(saved_keys[nsaved_keys], key, G_ELI_USERKEYLEN); nsaved_keys++; } } static int geli_same_device(struct geli_entry *ge, struct dsk *dskp) { if (ge->dsk->drive == dskp->drive && dskp->part == 255 && ge->dsk->part == dskp->slice) { /* * Sometimes slice = slice, and sometimes part = slice * If the incoming struct dsk has part=255, it means look at * the slice instead of the part number */ return (0); } /* Is this the same device? */ if (ge->dsk->drive != dskp->drive || ge->dsk->slice != dskp->slice || ge->dsk->part != dskp->part) { return (1); } return (0); } static int geli_findkey(struct geli_entry *ge, struct dsk *dskp, u_char *mkey) { u_int keynum; int i; if (ge->keybuf_slot >= 0) { if (g_eli_mkey_decrypt(&ge->md, saved_keys[ge->keybuf_slot], mkey, &keynum) == 0) { return (0); } } for (i = 0; i < nsaved_keys; i++) { if (g_eli_mkey_decrypt(&ge->md, saved_keys[i], mkey, &keynum) == 0) { ge->keybuf_slot = i; return (0); } } return (1); } void geli_init(void) { geli_count = 0; SLIST_INIT(&geli_head); } /* * Read the last sector of the drive or partition pointed to by dsk and see * if it is GELI encrypted */ int geli_taste(int read_func(void *vdev, void *priv, off_t off, void *buf, size_t bytes), struct dsk *dskp, daddr_t lastsector) { struct g_eli_metadata md; u_char buf[DEV_GELIBOOT_BSIZE]; int error; off_t alignsector; alignsector = rounddown2(lastsector * DEV_BSIZE, DEV_GELIBOOT_BSIZE); if (alignsector + DEV_GELIBOOT_BSIZE > ((lastsector + 1) * DEV_BSIZE)) { /* Don't read past the end of the disk */ alignsector = (lastsector * DEV_BSIZE) + DEV_BSIZE - DEV_GELIBOOT_BSIZE; } error = read_func(NULL, dskp, alignsector, &buf, DEV_GELIBOOT_BSIZE); if (error != 0) { return (error); } /* Extract the last 4k sector of the disk. */ error = eli_metadata_decode(buf, &md); if (error != 0) { /* Try the last 512 byte sector instead. */ error = eli_metadata_decode(buf + (DEV_GELIBOOT_BSIZE - DEV_BSIZE), &md); if (error != 0) { return (error); } } if (!(md.md_flags & G_ELI_FLAG_GELIBOOT)) { /* The GELIBOOT feature is not activated */ return (1); } if ((md.md_flags & G_ELI_FLAG_ONETIME)) { /* Swap device, skip it. */ return (1); } if (md.md_iterations < 0) { /* XXX TODO: Support loading key files. */ /* Disk does not have a passphrase, skip it. */ return (1); } geli_e = malloc(sizeof(struct geli_entry)); if (geli_e == NULL) return (2); geli_e->dsk = malloc(sizeof(struct dsk)); if (geli_e->dsk == NULL) return (2); memcpy(geli_e->dsk, dskp, sizeof(struct dsk)); geli_e->part_end = lastsector; if (dskp->part == 255) { geli_e->dsk->part = dskp->slice; } geli_e->keybuf_slot = -1; geli_e->md = md; eli_metadata_softc(&geli_e->sc, &md, DEV_BSIZE, (lastsector + DEV_BSIZE) * DEV_BSIZE); SLIST_INSERT_HEAD(&geli_head, geli_e, entries); geli_count++; return (0); } /* * Attempt to decrypt the device */ static int geli_attach(struct geli_entry *ge, struct dsk *dskp, const char *passphrase, - const u_char *mkeyp) + u_char *mkeyp) { u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN], *mkp; u_int keynum; struct hmac_ctx ctx; int error; if (mkeyp != NULL) { memcpy(&mkey, mkeyp, G_ELI_DATAIVKEYLEN); explicit_bzero(mkeyp, G_ELI_DATAIVKEYLEN); } if (mkeyp != NULL || geli_findkey(ge, dskp, mkey) == 0) { goto found_key; } g_eli_crypto_hmac_init(&ctx, NULL, 0); /* * Prepare Derived-Key from the user passphrase. */ if (geli_e->md.md_iterations < 0) { /* XXX TODO: Support loading key files. */ return (1); } else if (geli_e->md.md_iterations == 0) { g_eli_crypto_hmac_update(&ctx, geli_e->md.md_salt, sizeof(geli_e->md.md_salt)); - g_eli_crypto_hmac_update(&ctx, passphrase, + g_eli_crypto_hmac_update(&ctx, (const uint8_t *)passphrase, strlen(passphrase)); } else if (geli_e->md.md_iterations > 0) { printf("Calculating GELI Decryption Key disk%dp%d @ %d" " iterations...\n", dskp->unit, (dskp->slice > 0 ? dskp->slice : dskp->part), geli_e->md.md_iterations); u_char dkey[G_ELI_USERKEYLEN]; pkcs5v2_genkey(dkey, sizeof(dkey), geli_e->md.md_salt, sizeof(geli_e->md.md_salt), passphrase, geli_e->md.md_iterations); g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey)); explicit_bzero(dkey, sizeof(dkey)); } g_eli_crypto_hmac_final(&ctx, key, 0); error = g_eli_mkey_decrypt(&geli_e->md, key, mkey, &keynum); if (error == -1) { explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(key, sizeof(key)); printf("Bad GELI key: bad password?\n"); return (error); } else if (error != 0) { explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(key, sizeof(key)); printf("Failed to decrypt GELI master key: %d\n", error); return (error); } else { /* Add key to keychain */ save_key(key); explicit_bzero(&key, sizeof(key)); } found_key: /* Store the keys */ bcopy(mkey, geli_e->sc.sc_mkey, sizeof(geli_e->sc.sc_mkey)); bcopy(mkey, geli_e->sc.sc_ivkey, sizeof(geli_e->sc.sc_ivkey)); mkp = mkey + sizeof(geli_e->sc.sc_ivkey); if ((geli_e->sc.sc_flags & G_ELI_FLAG_AUTH) == 0) { bcopy(mkp, geli_e->sc.sc_ekey, G_ELI_DATAKEYLEN); } else { /* * The encryption key is: ekey = HMAC_SHA512(Data-Key, 0x10) */ - g_eli_crypto_hmac(mkp, G_ELI_MAXKEYLEN, "\x10", 1, + g_eli_crypto_hmac(mkp, G_ELI_MAXKEYLEN, (const uint8_t *)"\x10", 1, geli_e->sc.sc_ekey, 0); } explicit_bzero(mkey, sizeof(mkey)); /* Initialize the per-sector IV. */ switch (geli_e->sc.sc_ealgo) { case CRYPTO_AES_XTS: break; default: SHA256_Init(&geli_e->sc.sc_ivctx); SHA256_Update(&geli_e->sc.sc_ivctx, geli_e->sc.sc_ivkey, sizeof(geli_e->sc.sc_ivkey)); break; } return (0); } int is_geli(struct dsk *dskp) { SLIST_FOREACH_SAFE(geli_e, &geli_head, entries, geli_e_tmp) { if (geli_same_device(geli_e, dskp) == 0) { return (0); } } return (1); } int geli_read(struct dsk *dskp, off_t offset, u_char *buf, size_t bytes) { u_char iv[G_ELI_IVKEYLEN]; u_char *pbuf; int error; off_t dstoff; uint64_t keyno; size_t n, nsec, secsize; struct g_eli_key gkey; pbuf = buf; SLIST_FOREACH_SAFE(geli_e, &geli_head, entries, geli_e_tmp) { if (geli_same_device(geli_e, dskp) != 0) { continue; } secsize = geli_e->sc.sc_sectorsize; nsec = bytes / secsize; if (nsec == 0) { /* * A read of less than the GELI sector size has been * requested. The caller provided destination buffer may * not be big enough to boost the read to a full sector, * so just attempt to decrypt the truncated sector. */ secsize = bytes; nsec = 1; } for (n = 0, dstoff = offset; n < nsec; n++, dstoff += secsize) { g_eli_crypto_ivgen(&geli_e->sc, dstoff, iv, G_ELI_IVKEYLEN); /* Get the key that corresponds to this offset. */ keyno = (dstoff >> G_ELI_KEY_SHIFT) / secsize; g_eli_key_fill(&geli_e->sc, &gkey, keyno); error = geliboot_crypt(geli_e->sc.sc_ealgo, 0, pbuf, secsize, gkey.gek_key, geli_e->sc.sc_ekeylen, iv); if (error != 0) { explicit_bzero(&gkey, sizeof(gkey)); printf("Failed to decrypt in geli_read()!"); return (error); } pbuf += secsize; } explicit_bzero(&gkey, sizeof(gkey)); return (0); } printf("GELI provider not found\n"); return (1); } int geli_havekey(struct dsk *dskp) { u_char mkey[G_ELI_DATAIVKEYLEN]; SLIST_FOREACH_SAFE(geli_e, &geli_head, entries, geli_e_tmp) { if (geli_same_device(geli_e, dskp) != 0) { continue; } if (geli_findkey(geli_e, dskp, mkey) == 0) { if (geli_attach(geli_e, dskp, NULL, mkey) == 0) { return (0); } } } explicit_bzero(mkey, sizeof(mkey)); return (1); } int geli_passphrase(char *pw, int disk, int parttype, int part, struct dsk *dskp) { int i; SLIST_FOREACH_SAFE(geli_e, &geli_head, entries, geli_e_tmp) { if (geli_same_device(geli_e, dskp) != 0) { continue; } /* TODO: Implement GELI keyfile(s) support */ for (i = 0; i < 3; i++) { /* Try cached passphrase */ if (i == 0 && pw[0] != '\0') { if (geli_attach(geli_e, dskp, pw, NULL) == 0) { return (0); } } printf("GELI Passphrase for disk%d%c%d: ", disk, parttype, part); pwgets(pw, GELI_PW_MAXLEN, (geli_e->md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) == 0); printf("\n"); if (geli_attach(geli_e, dskp, pw, NULL) == 0) { return (0); } } } return (1); } Index: head/stand/i386/gptboot/gptboot.c =================================================================== --- head/stand/i386/gptboot/gptboot.c (revision 326447) +++ head/stand/i386/gptboot/gptboot.c (revision 326448) @@ -1,648 +1,654 @@ /*- * Copyright (c) 1998 Robert Nordier * All rights reserved. * * Redistribution and use in source and binary forms are freely * permitted provided that the above copyright notice and this * paragraph and the following disclaimer are duplicated in all * such forms. * * This software is provided "AS IS" and without any express or * implied warranties, including, without limitation, the implied * warranties of merchantability and fitness for a particular * purpose. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +/* Forward declared to avoid warnings -- these shouldn't be needed */ +int strcasecmp(const char *s1, const char *s2); +void explicit_bzero(void *b, size_t len); + #include "bootargs.h" #include "lib.h" #include "rbx.h" #include "drv.h" #include "util.h" #include "cons.h" #include "gpt.h" #include "paths.h" #define ARGS 0x900 #define NOPT 14 #define NDEV 3 #define MEM_BASE 0x12 #define MEM_EXT 0x15 #define DRV_HARD 0x80 #define DRV_MASK 0x7f #define TYPE_AD 0 #define TYPE_DA 1 #define TYPE_MAXHARD TYPE_DA #define TYPE_FD 2 extern uint32_t _end; static const uuid_t freebsd_ufs_uuid = GPT_ENT_TYPE_FREEBSD_UFS; static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */ static const unsigned char flags[NOPT] = { RBX_DUAL, RBX_SERIAL, RBX_ASKNAME, RBX_CDROM, RBX_CONFIG, RBX_KDB, RBX_GDB, RBX_MUTE, RBX_NOINTR, RBX_PAUSE, RBX_QUIET, RBX_DFLTROOT, RBX_SINGLE, RBX_VERBOSE }; uint32_t opts; static const char *const dev_nm[NDEV] = {"ad", "da", "fd"}; static const unsigned char dev_maj[NDEV] = {30, 4, 2}; static struct dsk dsk; static char kname[1024]; static int comspeed = SIOSPD; static struct bootinfo bootinfo; #ifdef LOADER_GELI_SUPPORT static struct geli_boot_args geliargs; #endif static vm_offset_t high_heap_base; static uint32_t bios_basemem, bios_extmem, high_heap_size; static struct bios_smap smap; /* * The minimum amount of memory to reserve in bios_extmem for the heap. */ #define HEAP_MIN (3 * 1024 * 1024) static char *heap_next; static char *heap_end; void exit(int); static void load(void); static int parse_cmds(char *, int *); static int dskread(void *, daddr_t, unsigned); void *malloc(size_t n); void free(void *ptr); #ifdef LOADER_GELI_SUPPORT static int vdev_read(void *vdev __unused, void *priv, off_t off, void *buf, size_t bytes); #endif void * malloc(size_t n) { char *p = heap_next; if (p + n > heap_end) { printf("malloc failure\n"); for (;;) ; /* NOTREACHED */ return (0); } heap_next += n; return (p); } void free(void *ptr) { return; } #include "ufsread.c" #include "gpt.c" #ifdef LOADER_GELI_SUPPORT #include "geliboot.c" static char gelipw[GELI_PW_MAXLEN]; static struct keybuf *gelibuf; #endif static inline int xfsread(ufs_ino_t inode, void *buf, size_t nbyte) { if ((size_t)fsread(inode, buf, nbyte) != nbyte) { printf("Invalid %s\n", "format"); return (-1); } return (0); } static void bios_getmem(void) { uint64_t size; /* Parse system memory map */ v86.ebx = 0; do { v86.ctl = V86_FLAGS; v86.addr = MEM_EXT; /* int 0x15 function 0xe820*/ v86.eax = 0xe820; v86.ecx = sizeof(struct bios_smap); v86.edx = SMAP_SIG; v86.es = VTOPSEG(&smap); v86.edi = VTOPOFF(&smap); v86int(); if ((v86.efl & 1) || (v86.eax != SMAP_SIG)) break; /* look for a low-memory segment that's large enough */ if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0) && (smap.length >= (512 * 1024))) bios_basemem = smap.length; /* look for the first segment in 'extended' memory */ if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0x100000)) { bios_extmem = smap.length; } /* * Look for the largest segment in 'extended' memory beyond * 1MB but below 4GB. */ if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base > 0x100000) && (smap.base < 0x100000000ull)) { size = smap.length; /* * If this segment crosses the 4GB boundary, truncate it. */ if (smap.base + size > 0x100000000ull) size = 0x100000000ull - smap.base; if (size > high_heap_size) { high_heap_size = size; high_heap_base = smap.base; } } } while (v86.ebx != 0); /* Fall back to the old compatibility function for base memory */ if (bios_basemem == 0) { v86.ctl = 0; v86.addr = 0x12; /* int 0x12 */ v86int(); bios_basemem = (v86.eax & 0xffff) * 1024; } /* Fall back through several compatibility functions for extended memory */ if (bios_extmem == 0) { v86.ctl = V86_FLAGS; v86.addr = 0x15; /* int 0x15 function 0xe801*/ v86.eax = 0xe801; v86int(); if (!(v86.efl & 1)) { bios_extmem = ((v86.ecx & 0xffff) + ((v86.edx & 0xffff) * 64)) * 1024; } } if (bios_extmem == 0) { v86.ctl = 0; v86.addr = 0x15; /* int 0x15 function 0x88*/ v86.eax = 0x8800; v86int(); bios_extmem = (v86.eax & 0xffff) * 1024; } /* * If we have extended memory and did not find a suitable heap * region in the SMAP, use the last 3MB of 'extended' memory as a * high heap candidate. */ if (bios_extmem >= HEAP_MIN && high_heap_size < HEAP_MIN) { high_heap_size = HEAP_MIN; high_heap_base = bios_extmem + 0x100000 - HEAP_MIN; } } static int gptinit(void) { if (gptread(&freebsd_ufs_uuid, &dsk, dmadat->secbuf) == -1) { printf("%s: unable to load GPT\n", BOOTPROG); return (-1); } if (gptfind(&freebsd_ufs_uuid, &dsk, dsk.part) == -1) { printf("%s: no UFS partition was found\n", BOOTPROG); return (-1); } #ifdef LOADER_GELI_SUPPORT if (geli_taste(vdev_read, &dsk, (gpttable[curent].ent_lba_end - gpttable[curent].ent_lba_start)) == 0) { - if (geli_havekey(&dsk) != 0 && geli_passphrase(&gelipw, + if (geli_havekey(&dsk) != 0 && geli_passphrase(gelipw, dsk.unit, 'p', curent + 1, &dsk) != 0) { printf("%s: unable to decrypt GELI key\n", BOOTPROG); return (-1); } } #endif dsk_meta = 0; return (0); } + +int main(void); int main(void) { char cmd[512], cmdtmp[512]; ssize_t sz; int autoboot, dskupdated; ufs_ino_t ino; dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base); bios_getmem(); if (high_heap_size > 0) { heap_end = PTOV(high_heap_base + high_heap_size); heap_next = PTOV(high_heap_base); } else { heap_next = (char *)dmadat + sizeof(*dmadat); heap_end = (char *)PTOV(bios_basemem); } v86.ctl = V86_FLAGS; v86.efl = PSL_RESERVED_DEFAULT | PSL_I; dsk.drive = *(uint8_t *)PTOV(ARGS); dsk.type = dsk.drive & DRV_HARD ? TYPE_AD : TYPE_FD; dsk.unit = dsk.drive & DRV_MASK; dsk.part = -1; dsk.start = 0; bootinfo.bi_version = BOOTINFO_VERSION; bootinfo.bi_size = sizeof(bootinfo); bootinfo.bi_basemem = bios_basemem / 1024; bootinfo.bi_extmem = bios_extmem / 1024; bootinfo.bi_memsizes_valid++; bootinfo.bi_bios_dev = dsk.drive; #ifdef LOADER_GELI_SUPPORT geli_init(); #endif /* Process configuration file */ if (gptinit() != 0) return (-1); autoboot = 1; *cmd = '\0'; for (;;) { *kname = '\0'; if ((ino = lookup(PATH_CONFIG)) || (ino = lookup(PATH_DOTCONFIG))) { sz = fsread(ino, cmd, sizeof(cmd) - 1); cmd[(sz < 0) ? 0 : sz] = '\0'; } if (*cmd != '\0') { memcpy(cmdtmp, cmd, sizeof(cmdtmp)); if (parse_cmds(cmdtmp, &dskupdated)) break; if (dskupdated && gptinit() != 0) break; if (!OPT_CHECK(RBX_QUIET)) printf("%s: %s", PATH_CONFIG, cmd); *cmd = '\0'; } if (autoboot && keyhit(3)) { if (*kname == '\0') memcpy(kname, PATH_LOADER, sizeof(PATH_LOADER)); break; } autoboot = 0; /* * Try to exec stage 3 boot loader. If interrupted by a * keypress, or in case of failure, try to load a kernel * directly instead. */ if (*kname != '\0') load(); memcpy(kname, PATH_LOADER, sizeof(PATH_LOADER)); load(); memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL)); load(); gptbootfailed(&dsk); if (gptfind(&freebsd_ufs_uuid, &dsk, -1) == -1) break; dsk_meta = 0; } /* Present the user with the boot2 prompt. */ for (;;) { if (!OPT_CHECK(RBX_QUIET)) { printf("\nFreeBSD/x86 boot\n" "Default: %u:%s(%up%u)%s\n" "boot: ", dsk.drive & DRV_MASK, dev_nm[dsk.type], dsk.unit, dsk.part, kname); } if (ioctrl & IO_SERIAL) sio_flush(); *cmd = '\0'; if (keyhit(0)) getstr(cmd, sizeof(cmd)); else if (!OPT_CHECK(RBX_QUIET)) putchar('\n'); if (parse_cmds(cmd, &dskupdated)) { putchar('\a'); continue; } if (dskupdated && gptinit() != 0) continue; load(); } /* NOTREACHED */ } /* XXX - Needed for btxld to link the boot2 binary; do not remove. */ void exit(int x) { } static void load(void) { union { struct exec ex; Elf32_Ehdr eh; } hdr; static Elf32_Phdr ep[2]; static Elf32_Shdr es[2]; caddr_t p; ufs_ino_t ino; uint32_t addr, x; int fmt, i, j; if (!(ino = lookup(kname))) { if (!ls) { printf("%s: No %s on %u:%s(%up%u)\n", BOOTPROG, kname, dsk.drive & DRV_MASK, dev_nm[dsk.type], dsk.unit, dsk.part); } return; } if (xfsread(ino, &hdr, sizeof(hdr))) return; if (N_GETMAGIC(hdr.ex) == ZMAGIC) fmt = 0; else if (IS_ELF(hdr.eh)) fmt = 1; else { printf("Invalid %s\n", "format"); return; } if (fmt == 0) { addr = hdr.ex.a_entry & 0xffffff; p = PTOV(addr); fs_off = PAGE_SIZE; if (xfsread(ino, p, hdr.ex.a_text)) return; p += roundup2(hdr.ex.a_text, PAGE_SIZE); if (xfsread(ino, p, hdr.ex.a_data)) return; p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms)); p += sizeof(hdr.ex.a_syms); if (hdr.ex.a_syms) { if (xfsread(ino, p, hdr.ex.a_syms)) return; p += hdr.ex.a_syms; if (xfsread(ino, p, sizeof(int))) return; x = *(uint32_t *)p; p += sizeof(int); x -= sizeof(int); if (xfsread(ino, p, x)) return; p += x; } } else { fs_off = hdr.eh.e_phoff; for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) { if (xfsread(ino, ep + j, sizeof(ep[0]))) return; if (ep[j].p_type == PT_LOAD) j++; } for (i = 0; i < 2; i++) { p = PTOV(ep[i].p_paddr & 0xffffff); fs_off = ep[i].p_offset; if (xfsread(ino, p, ep[i].p_filesz)) return; } p += roundup2(ep[1].p_memsz, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) { fs_off = hdr.eh.e_shoff + sizeof(es[0]) * (hdr.eh.e_shstrndx + 1); if (xfsread(ino, &es, sizeof(es))) return; for (i = 0; i < 2; i++) { memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size)); p += sizeof(es[i].sh_size); fs_off = es[i].sh_offset; if (xfsread(ino, p, es[i].sh_size)) return; p += es[i].sh_size; } } addr = hdr.eh.e_entry & 0xffffff; } bootinfo.bi_esymtab = VTOP(p); bootinfo.bi_kernelname = VTOP(kname); bootinfo.bi_bios_dev = dsk.drive; #ifdef LOADER_GELI_SUPPORT geliargs.size = sizeof(geliargs); explicit_bzero(gelipw, sizeof(gelipw)); gelibuf = malloc(sizeof(struct keybuf) + (GELI_MAX_KEYS * sizeof(struct keybuf_ent))); geli_fill_keybuf(gelibuf); geliargs.notapw = '\0'; geliargs.keybuf_sentinel = KEYBUF_SENTINEL; geliargs.keybuf = gelibuf; #endif __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), MAKEBOOTDEV(dev_maj[dsk.type], dsk.part + 1, dsk.unit, 0xff), KARGS_FLAGS_EXTARG, 0, 0, VTOP(&bootinfo) #ifdef LOADER_GELI_SUPPORT , geliargs #endif ); } static int parse_cmds(char *cmdstr, int *dskupdated) { char *arg = cmdstr; char *ep, *p, *q; const char *cp; unsigned int drv; int c, i, j; *dskupdated = 0; while ((c = *arg++)) { if (c == ' ' || c == '\t' || c == '\n') continue; for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++); ep = p; if (*p) *p++ = 0; if (c == '-') { while ((c = *arg++)) { if (c == 'P') { if (*(uint8_t *)PTOV(0x496) & 0x10) { cp = "yes"; } else { opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL); cp = "no"; } printf("Keyboard: %s\n", cp); continue; } else if (c == 'S') { j = 0; while ((unsigned int)(i = *arg++ - '0') <= 9) j = j * 10 + i; if (j > 0 && i == -'0') { comspeed = j; break; } /* Fall through to error below ('S' not in optstr[]). */ } for (i = 0; c != optstr[i]; i++) if (i == NOPT - 1) return -1; opts ^= OPT_SET(flags[i]); } ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) : OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD; if (ioctrl & IO_SERIAL) { if (sio_init(115200 / comspeed) != 0) ioctrl &= ~IO_SERIAL; } } else { for (q = arg--; *q && *q != '('; q++); if (*q) { drv = -1; if (arg[1] == ':') { drv = *arg - '0'; if (drv > 9) return (-1); arg += 2; } if (q - arg != 2) return -1; for (i = 0; arg[0] != dev_nm[i][0] || arg[1] != dev_nm[i][1]; i++) if (i == NDEV - 1) return -1; dsk.type = i; arg += 3; dsk.unit = *arg - '0'; if (arg[1] != 'p' || dsk.unit > 9) return -1; arg += 2; dsk.part = *arg - '0'; if (dsk.part < 1 || dsk.part > 9) return -1; arg++; if (arg[0] != ')') return -1; arg++; if (drv == -1) drv = dsk.unit; dsk.drive = (dsk.type <= TYPE_MAXHARD ? DRV_HARD : 0) + drv; *dskupdated = 1; } if ((i = ep - arg)) { if ((size_t)i >= sizeof(kname)) return -1; memcpy(kname, arg, i + 1); } } arg = p; } return 0; } static int dskread(void *buf, daddr_t lba, unsigned nblk) { int err; err = drvread(&dsk, buf, lba + dsk.start, nblk); #ifdef LOADER_GELI_SUPPORT if (err == 0 && is_geli(&dsk) == 0) { /* Decrypt */ if (geli_read(&dsk, lba * DEV_BSIZE, buf, nblk * DEV_BSIZE)) return (err); } #endif return (err); } #ifdef LOADER_GELI_SUPPORT /* * Read function compartible with the ZFS callback, required to keep the GELI * Implementation the same for both UFS and ZFS */ static int vdev_read(void *vdev __unused, void *priv, off_t off, void *buf, size_t bytes) { char *p; daddr_t lba; unsigned int nb; struct dsk *dskp = (struct dsk *) priv; if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) return (-1); p = buf; lba = off / DEV_BSIZE; lba += dskp->start; while (bytes > 0) { nb = bytes / DEV_BSIZE; if (nb > VBLKSIZE / DEV_BSIZE) nb = VBLKSIZE / DEV_BSIZE; if (drvread(dskp, dmadat->blkbuf, lba, nb)) return (-1); memcpy(p, dmadat->blkbuf, nb * DEV_BSIZE); p += nb * DEV_BSIZE; lba += nb; bytes -= nb * DEV_BSIZE; } return (0); } #endif /* LOADER_GELI_SUPPORT */ Index: head/stand/i386/libi386/biosdisk.c =================================================================== --- head/stand/i386/libi386/biosdisk.c (revision 326447) +++ head/stand/i386/libi386/biosdisk.c (revision 326448) @@ -1,1013 +1,1013 @@ /*- * Copyright (c) 1998 Michael Smith * Copyright (c) 2012 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * BIOS disk device handling. * * Ideas and algorithms from: * * - NetBSD libi386/biosdisk.c * - FreeBSD biosboot/disk.c * */ #include #include #include #include #include #include #include #include #include "disk.h" #include "libi386.h" #ifdef LOADER_GELI_SUPPORT #include "cons.h" #include "drv.h" #include "gpt.h" #include "part.h" #include struct pentry { struct ptable_entry part; uint64_t flags; union { uint8_t bsd; uint8_t mbr; uuid_t gpt; uint16_t vtoc8; } type; STAILQ_ENTRY(pentry) entry; }; struct ptable { enum ptable_type type; uint16_t sectorsize; uint64_t sectors; STAILQ_HEAD(, pentry) entries; }; #include "geliboot.c" #endif /* LOADER_GELI_SUPPORT */ CTASSERT(sizeof(struct i386_devdesc) >= sizeof(struct disk_devdesc)); #define BIOS_NUMDRIVES 0x475 #define BIOSDISK_SECSIZE 512 #define BUFSIZE (1 * BIOSDISK_SECSIZE) #define DT_ATAPI 0x10 /* disk type for ATAPI floppies */ #define WDMAJOR 0 /* major numbers for devices we frontend for */ #define WFDMAJOR 1 #define FDMAJOR 2 #define DAMAJOR 4 #ifdef DISK_DEBUG # define DEBUG(fmt, args...) printf("%s: " fmt "\n" , __func__ , ## args) #else # define DEBUG(fmt, args...) #endif /* * List of BIOS devices, translation from disk unit number to * BIOS unit number. */ static struct bdinfo { int bd_unit; /* BIOS unit number */ int bd_cyl; /* BIOS geometry */ int bd_hds; int bd_sec; int bd_flags; #define BD_MODEINT13 0x0000 #define BD_MODEEDD1 0x0001 #define BD_MODEEDD3 0x0002 #define BD_MODEMASK 0x0003 #define BD_FLOPPY 0x0004 int bd_type; /* BIOS 'drive type' (floppy only) */ uint16_t bd_sectorsize; /* Sector size */ uint64_t bd_sectors; /* Disk size */ int bd_open; /* reference counter */ void *bd_bcache; /* buffer cache data */ } bdinfo [MAXBDDEV]; static int nbdinfo = 0; #define BD(dev) (bdinfo[(dev)->d_unit]) static int bd_read(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest); static int bd_write(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest); static int bd_int13probe(struct bdinfo *bd); static int bd_init(void); static int bd_strategy(void *devdata, int flag, daddr_t dblk, size_t size, char *buf, size_t *rsize); static int bd_realstrategy(void *devdata, int flag, daddr_t dblk, size_t size, char *buf, size_t *rsize); static int bd_open(struct open_file *f, ...); static int bd_close(struct open_file *f); static int bd_ioctl(struct open_file *f, u_long cmd, void *data); static int bd_print(int verbose); #ifdef LOADER_GELI_SUPPORT enum isgeli { ISGELI_UNKNOWN, ISGELI_NO, ISGELI_YES }; static enum isgeli geli_status[MAXBDDEV][MAXTBLENTS]; -int bios_read(void *vdev __unused, struct dsk *priv, off_t off, char *buf, - size_t bytes); +int bios_read(void *, void *, off_t off, void *buf, size_t bytes); #endif /* LOADER_GELI_SUPPORT */ struct devsw biosdisk = { "disk", DEVT_DISK, bd_init, bd_strategy, bd_open, bd_close, bd_ioctl, bd_print, NULL }; /* * Translate between BIOS device numbers and our private unit numbers. */ int bd_bios2unit(int biosdev) { int i; DEBUG("looking for bios device 0x%x", biosdev); for (i = 0; i < nbdinfo; i++) { DEBUG("bd unit %d is BIOS device 0x%x", i, bdinfo[i].bd_unit); if (bdinfo[i].bd_unit == biosdev) return (i); } return (-1); } int bd_unit2bios(int unit) { if ((unit >= 0) && (unit < nbdinfo)) return (bdinfo[unit].bd_unit); return (-1); } /* * Quiz the BIOS for disk devices, save a little info about them. */ static int bd_init(void) { int base, unit, nfd = 0; #ifdef LOADER_GELI_SUPPORT geli_init(); #endif /* sequence 0, 0x80 */ for (base = 0; base <= 0x80; base += 0x80) { for (unit = base; (nbdinfo < MAXBDDEV); unit++) { #ifndef VIRTUALBOX /* * Check the BIOS equipment list for number * of fixed disks. */ if(base == 0x80 && (nfd >= *(unsigned char *)PTOV(BIOS_NUMDRIVES))) break; #endif bdinfo[nbdinfo].bd_open = 0; bdinfo[nbdinfo].bd_bcache = NULL; bdinfo[nbdinfo].bd_unit = unit; bdinfo[nbdinfo].bd_flags = unit < 0x80 ? BD_FLOPPY: 0; if (!bd_int13probe(&bdinfo[nbdinfo])) break; /* XXX we need "disk aliases" to make this simpler */ printf("BIOS drive %c: is disk%d\n", (unit < 0x80) ? ('A' + unit): ('C' + unit - 0x80), nbdinfo); nbdinfo++; if (base == 0x80) nfd++; } } bcache_add_dev(nbdinfo); return(0); } /* * Try to detect a device supported by the legacy int13 BIOS */ static int bd_int13probe(struct bdinfo *bd) { struct edd_params params; int ret = 1; /* assume success */ v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x800; v86.edx = bd->bd_unit; v86int(); /* Don't error out if we get bad sector number, try EDD as well */ if (V86_CY(v86.efl) || /* carry set */ (v86.edx & 0xff) <= (unsigned)(bd->bd_unit & 0x7f)) /* unit # bad */ return (0); /* skip device */ if ((v86.ecx & 0x3f) == 0) /* absurd sector number */ ret = 0; /* set error */ /* Convert max cyl # -> # of cylinders */ bd->bd_cyl = ((v86.ecx & 0xc0) << 2) + ((v86.ecx & 0xff00) >> 8) + 1; /* Convert max head # -> # of heads */ bd->bd_hds = ((v86.edx & 0xff00) >> 8) + 1; bd->bd_sec = v86.ecx & 0x3f; bd->bd_type = v86.ebx & 0xff; bd->bd_flags |= BD_MODEINT13; /* Calculate sectors count from the geometry */ bd->bd_sectors = bd->bd_cyl * bd->bd_hds * bd->bd_sec; bd->bd_sectorsize = BIOSDISK_SECSIZE; DEBUG("unit 0x%x geometry %d/%d/%d", bd->bd_unit, bd->bd_cyl, bd->bd_hds, bd->bd_sec); /* Determine if we can use EDD with this device. */ v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x4100; v86.edx = bd->bd_unit; v86.ebx = 0x55aa; v86int(); if (V86_CY(v86.efl) || /* carry set */ (v86.ebx & 0xffff) != 0xaa55 || /* signature */ (v86.ecx & EDD_INTERFACE_FIXED_DISK) == 0) return (ret); /* return code from int13 AH=08 */ /* EDD supported */ bd->bd_flags |= BD_MODEEDD1; if ((v86.eax & 0xff00) >= 0x3000) bd->bd_flags |= BD_MODEEDD3; /* Get disk params */ params.len = sizeof(struct edd_params); v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x4800; v86.edx = bd->bd_unit; v86.ds = VTOPSEG(¶ms); v86.esi = VTOPOFF(¶ms); v86int(); if (!V86_CY(v86.efl)) { uint64_t total; /* * Sector size must be a multiple of 512 bytes. * An alternate test would be to check power of 2, * powerof2(params.sector_size). */ if (params.sector_size % BIOSDISK_SECSIZE) bd->bd_sectorsize = BIOSDISK_SECSIZE; else bd->bd_sectorsize = params.sector_size; total = bd->bd_sectorsize * params.sectors; if (params.sectors != 0) { /* Only update if we did not overflow. */ if (total > params.sectors) bd->bd_sectors = params.sectors; } total = (uint64_t)params.cylinders * params.heads * params.sectors_per_track; if (bd->bd_sectors < total) bd->bd_sectors = total; ret = 1; } DEBUG("unit 0x%x flags %x, sectors %llu, sectorsize %u", bd->bd_unit, bd->bd_flags, bd->bd_sectors, bd->bd_sectorsize); return (ret); } /* * Print information about disks */ static int bd_print(int verbose) { static char line[80]; struct disk_devdesc dev; int i, ret = 0; if (nbdinfo == 0) return (0); printf("%s devices:", biosdisk.dv_name); if ((ret = pager_output("\n")) != 0) return (ret); for (i = 0; i < nbdinfo; i++) { snprintf(line, sizeof(line), " disk%d: BIOS drive %c (%ju X %u):\n", i, (bdinfo[i].bd_unit < 0x80) ? ('A' + bdinfo[i].bd_unit): ('C' + bdinfo[i].bd_unit - 0x80), (uintmax_t)bdinfo[i].bd_sectors, bdinfo[i].bd_sectorsize); if ((ret = pager_output(line)) != 0) break; dev.d_dev = &biosdisk; dev.d_unit = i; dev.d_slice = -1; dev.d_partition = -1; if (disk_open(&dev, bdinfo[i].bd_sectorsize * bdinfo[i].bd_sectors, bdinfo[i].bd_sectorsize) == 0) { snprintf(line, sizeof(line), " disk%d", i); ret = disk_print(&dev, line, verbose); disk_close(&dev); if (ret != 0) return (ret); } } return (ret); } /* * Attempt to open the disk described by (dev) for use by (f). * * Note that the philosophy here is "give them exactly what * they ask for". This is necessary because being too "smart" * about what the user might want leads to complications. * (eg. given no slice or partition value, with a disk that is * sliced - are they after the first BSD slice, or the DOS * slice before it?) */ static int bd_open(struct open_file *f, ...) { struct disk_devdesc *dev, rdev; struct disk_devdesc disk; int err, g_err; va_list ap; uint64_t size; va_start(ap, f); dev = va_arg(ap, struct disk_devdesc *); va_end(ap); if (dev->d_unit < 0 || dev->d_unit >= nbdinfo) return (EIO); BD(dev).bd_open++; if (BD(dev).bd_bcache == NULL) BD(dev).bd_bcache = bcache_allocate(); /* * Read disk size from partition. * This is needed to work around buggy BIOS systems returning * wrong (truncated) disk media size. * During bd_probe() we tested if the mulitplication of bd_sectors * would overflow so it should be safe to perform here. */ disk.d_dev = dev->d_dev; disk.d_type = dev->d_type; disk.d_unit = dev->d_unit; disk.d_opendata = NULL; disk.d_slice = -1; disk.d_partition = -1; disk.d_offset = 0; if (disk_open(&disk, BD(dev).bd_sectors * BD(dev).bd_sectorsize, BD(dev).bd_sectorsize) == 0) { if (disk_ioctl(&disk, DIOCGMEDIASIZE, &size) == 0) { size /= BD(dev).bd_sectorsize; if (size > BD(dev).bd_sectors) BD(dev).bd_sectors = size; } disk_close(&disk); } err = disk_open(dev, BD(dev).bd_sectors * BD(dev).bd_sectorsize, BD(dev).bd_sectorsize); #ifdef LOADER_GELI_SUPPORT static char gelipw[GELI_PW_MAXLEN]; char *passphrase; if (err) return (err); /* if we already know there is no GELI, skip the rest */ if (geli_status[dev->d_unit][dev->d_slice] != ISGELI_UNKNOWN) return (err); struct dsk dskp; struct ptable *table = NULL; struct ptable_entry part; struct pentry *entry; int geli_part = 0; dskp.drive = bd_unit2bios(dev->d_unit); dskp.type = dev->d_type; dskp.unit = dev->d_unit; dskp.slice = dev->d_slice; dskp.part = dev->d_partition; dskp.start = dev->d_offset; memcpy(&rdev, dev, sizeof(rdev)); /* to read the GPT table, we need to read the first sector */ rdev.d_offset = 0; /* We need the LBA of the end of the partition */ table = ptable_open(&rdev, BD(dev).bd_sectors, BD(dev).bd_sectorsize, ptblread); if (table == NULL) { DEBUG("Can't read partition table"); /* soft failure, return the exit status of disk_open */ return (err); } if (table->type == PTABLE_GPT) dskp.part = 255; STAILQ_FOREACH(entry, &table->entries, entry) { dskp.slice = entry->part.index; dskp.start = entry->part.start; if (is_geli(&dskp) == 0) { geli_status[dev->d_unit][dskp.slice] = ISGELI_YES; return (0); } if (geli_taste(bios_read, &dskp, entry->part.end - entry->part.start) == 0) { if (geli_havekey(&dskp) == 0) { geli_status[dev->d_unit][dskp.slice] = ISGELI_YES; geli_part++; continue; } if ((passphrase = getenv("kern.geom.eli.passphrase")) != NULL) { /* Use the cached passphrase */ bcopy(passphrase, &gelipw, GELI_PW_MAXLEN); } - if (geli_passphrase(&gelipw, dskp.unit, 'p', + if (geli_passphrase(gelipw, dskp.unit, 'p', (dskp.slice > 0 ? dskp.slice : dskp.part), &dskp) == 0) { - setenv("kern.geom.eli.passphrase", &gelipw, 1); + setenv("kern.geom.eli.passphrase", gelipw, 1); bzero(gelipw, sizeof(gelipw)); geli_status[dev->d_unit][dskp.slice] = ISGELI_YES; geli_part++; continue; } } else geli_status[dev->d_unit][dskp.slice] = ISGELI_NO; } /* none of the partitions on this disk have GELI */ if (geli_part == 0) { /* found no GELI */ geli_status[dev->d_unit][dev->d_slice] = ISGELI_NO; } #endif /* LOADER_GELI_SUPPORT */ return (err); } static int bd_close(struct open_file *f) { struct disk_devdesc *dev; dev = (struct disk_devdesc *)f->f_devdata; BD(dev).bd_open--; if (BD(dev).bd_open == 0) { bcache_free(BD(dev).bd_bcache); BD(dev).bd_bcache = NULL; } return (disk_close(dev)); } static int bd_ioctl(struct open_file *f, u_long cmd, void *data) { struct disk_devdesc *dev; int rc; dev = (struct disk_devdesc *)f->f_devdata; rc = disk_ioctl(dev, cmd, data); if (rc != ENOTTY) return (rc); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = BD(dev).bd_sectorsize; break; case DIOCGMEDIASIZE: *(uint64_t *)data = BD(dev).bd_sectors * BD(dev).bd_sectorsize; break; default: return (ENOTTY); } return (0); } static int bd_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) { struct bcache_devdata bcd; struct disk_devdesc *dev; dev = (struct disk_devdesc *)devdata; bcd.dv_strategy = bd_realstrategy; bcd.dv_devdata = devdata; bcd.dv_cache = BD(dev).bd_bcache; return (bcache_strategy(&bcd, rw, dblk + dev->d_offset, size, buf, rsize)); } static int bd_realstrategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) { struct disk_devdesc *dev = (struct disk_devdesc *)devdata; uint64_t disk_blocks; int blks, rc; #ifdef BD_SUPPORT_FRAGS /* XXX: sector size */ char fragbuf[BIOSDISK_SECSIZE]; size_t fragsize; fragsize = size % BIOSDISK_SECSIZE; #else if (size % BD(dev).bd_sectorsize) panic("bd_strategy: %d bytes I/O not multiple of block size", size); #endif DEBUG("open_disk %p", dev); /* * Check the value of the size argument. We do have quite small * heap (64MB), but we do not know good upper limit, so we check against * INT_MAX here. This will also protect us against possible overflows * while translating block count to bytes. */ if (size > INT_MAX) { DEBUG("too large read: %zu bytes", size); return (EIO); } blks = size / BD(dev).bd_sectorsize; if (dblk > dblk + blks) return (EIO); if (rsize) *rsize = 0; /* Get disk blocks, this value is either for whole disk or for partition */ if (disk_ioctl(dev, DIOCGMEDIASIZE, &disk_blocks)) { /* DIOCGMEDIASIZE does return bytes. */ disk_blocks /= BD(dev).bd_sectorsize; } else { /* We should not get here. Just try to survive. */ disk_blocks = BD(dev).bd_sectors - dev->d_offset; } /* Validate source block address. */ if (dblk < dev->d_offset || dblk >= dev->d_offset + disk_blocks) return (EIO); /* * Truncate if we are crossing disk or partition end. */ if (dblk + blks >= dev->d_offset + disk_blocks) { blks = dev->d_offset + disk_blocks - dblk; size = blks * BD(dev).bd_sectorsize; DEBUG("short read %d", blks); } switch (rw & F_MASK) { case F_READ: DEBUG("read %d from %lld to %p", blks, dblk, buf); if (blks && (rc = bd_read(dev, dblk, blks, buf))) { /* Filter out floppy controller errors */ if (BD(dev).bd_flags != BD_FLOPPY || rc != 0x20) { printf("read %d from %lld to %p, error: 0x%x", blks, dblk, buf, rc); } return (EIO); } #ifdef BD_SUPPORT_FRAGS /* XXX: sector size */ DEBUG("bd_strategy: frag read %d from %d+%d to %p", fragsize, dblk, blks, buf + (blks * BIOSDISK_SECSIZE)); if (fragsize && bd_read(od, dblk + blks, 1, fragsize)) { DEBUG("frag read error"); return(EIO); } bcopy(fragbuf, buf + (blks * BIOSDISK_SECSIZE), fragsize); #endif break; case F_WRITE : DEBUG("write %d from %d to %p", blks, dblk, buf); if (blks && bd_write(dev, dblk, blks, buf)) { DEBUG("write error"); return (EIO); } #ifdef BD_SUPPORT_FRAGS if(fragsize) { DEBUG("Attempted to write a frag"); return (EIO); } #endif break; default: /* DO NOTHING */ return (EROFS); } if (rsize) *rsize = size; return (0); } static int bd_edd_io(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest, int write) { static struct edd_packet packet; packet.len = sizeof(struct edd_packet); packet.count = blks; packet.off = VTOPOFF(dest); packet.seg = VTOPSEG(dest); packet.lba = dblk; v86.ctl = V86_FLAGS; v86.addr = 0x13; if (write) /* Should we Write with verify ?? 0x4302 ? */ v86.eax = 0x4300; else v86.eax = 0x4200; v86.edx = BD(dev).bd_unit; v86.ds = VTOPSEG(&packet); v86.esi = VTOPOFF(&packet); v86int(); if (V86_CY(v86.efl)) return (v86.eax >> 8); return (0); } static int bd_chs_io(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest, int write) { u_int x, bpc, cyl, hd, sec; bpc = BD(dev).bd_sec * BD(dev).bd_hds; /* blocks per cylinder */ x = dblk; cyl = x / bpc; /* block # / blocks per cylinder */ x %= bpc; /* block offset into cylinder */ hd = x / BD(dev).bd_sec; /* offset / blocks per track */ sec = x % BD(dev).bd_sec; /* offset into track */ /* correct sector number for 1-based BIOS numbering */ sec++; if (cyl > 1023) /* CHS doesn't support cylinders > 1023. */ return (1); v86.ctl = V86_FLAGS; v86.addr = 0x13; if (write) v86.eax = 0x300 | blks; else v86.eax = 0x200 | blks; v86.ecx = ((cyl & 0xff) << 8) | ((cyl & 0x300) >> 2) | sec; v86.edx = (hd << 8) | BD(dev).bd_unit; v86.es = VTOPSEG(dest); v86.ebx = VTOPOFF(dest); v86int(); if (V86_CY(v86.efl)) return (v86.eax >> 8); return (0); } static int bd_io(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest, int write) { u_int x, sec, result, resid, retry, maxfer; caddr_t p, xp, bbuf; /* Just in case some idiot actually tries to read/write -1 blocks... */ if (blks < 0) return (-1); resid = blks; p = dest; /* Decide whether we have to bounce */ if (VTOP(dest) >> 20 != 0 || (BD(dev).bd_unit < 0x80 && (VTOP(dest) >> 16) != (VTOP(dest + blks * BD(dev).bd_sectorsize) >> 16))) { /* * There is a 64k physical boundary somewhere in the * destination buffer, or the destination buffer is above * first 1MB of physical memory so we have to arrange a * suitable bounce buffer. Allocate a buffer twice as large * as we need to. Use the bottom half unless there is a break * there, in which case we use the top half. */ x = V86_IO_BUFFER_SIZE / BD(dev).bd_sectorsize; x = min(x, (unsigned)blks); bbuf = PTOV(V86_IO_BUFFER); maxfer = x; /* limit transfers to bounce region size */ } else { bbuf = NULL; maxfer = 0; } while (resid > 0) { /* * Play it safe and don't cross track boundaries. * (XXX this is probably unnecessary) */ sec = dblk % BD(dev).bd_sec; /* offset into track */ x = min(BD(dev).bd_sec - sec, resid); if (maxfer > 0) x = min(x, maxfer); /* fit bounce buffer */ /* where do we transfer to? */ xp = bbuf == NULL ? p : bbuf; /* * Put your Data In, Put your Data out, * Put your Data In, and shake it all about */ if (write && bbuf != NULL) bcopy(p, bbuf, x * BD(dev).bd_sectorsize); /* * Loop retrying the operation a couple of times. The BIOS * may also retry. */ for (retry = 0; retry < 3; retry++) { /* if retrying, reset the drive */ if (retry > 0) { v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0; v86.edx = BD(dev).bd_unit; v86int(); } if (BD(dev).bd_flags & BD_MODEEDD1) result = bd_edd_io(dev, dblk, x, xp, write); else result = bd_chs_io(dev, dblk, x, xp, write); if (result == 0) break; } if (write) DEBUG("Write %d sector(s) from %p (0x%x) to %lld %s", x, p, VTOP(p), dblk, result ? "failed" : "ok"); else DEBUG("Read %d sector(s) from %lld to %p (0x%x) %s", x, dblk, p, VTOP(p), result ? "failed" : "ok"); if (result) { return (result); } if (!write && bbuf != NULL) bcopy(bbuf, p, x * BD(dev).bd_sectorsize); p += (x * BD(dev).bd_sectorsize); dblk += x; resid -= x; } /* hexdump(dest, (blks * BD(dev).bd_sectorsize)); */ return(0); } static int bd_read(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest) { #ifdef LOADER_GELI_SUPPORT struct dsk dskp; off_t p_off, diff; daddr_t alignlba; int err, n, alignblks; char *tmpbuf; /* if we already know there is no GELI, skip the rest */ if (geli_status[dev->d_unit][dev->d_slice] != ISGELI_YES) return (bd_io(dev, dblk, blks, dest, 0)); if (geli_status[dev->d_unit][dev->d_slice] == ISGELI_YES) { /* * Align reads to DEV_GELIBOOT_BSIZE bytes because partial * sectors cannot be decrypted. Round the requested LBA down to * nearest multiple of DEV_GELIBOOT_BSIZE bytes. */ alignlba = rounddown2(dblk * BD(dev).bd_sectorsize, DEV_GELIBOOT_BSIZE) / BD(dev).bd_sectorsize; /* * Round number of blocks to read up to nearest multiple of * DEV_GELIBOOT_BSIZE */ diff = (dblk - alignlba) * BD(dev).bd_sectorsize; alignblks = roundup2(blks * BD(dev).bd_sectorsize + diff, DEV_GELIBOOT_BSIZE) / BD(dev).bd_sectorsize; /* * If the read is rounded up to a larger size, use a temporary * buffer here because the buffer provided by the caller may be * too small. */ if (diff == 0) { tmpbuf = dest; } else { tmpbuf = malloc(alignblks * BD(dev).bd_sectorsize); if (tmpbuf == NULL) { return (-1); } } err = bd_io(dev, alignlba, alignblks, tmpbuf, 0); if (err) return (err); dskp.drive = bd_unit2bios(dev->d_unit); dskp.type = dev->d_type; dskp.unit = dev->d_unit; dskp.slice = dev->d_slice; dskp.part = dev->d_partition; dskp.start = dev->d_offset; /* GELI needs the offset relative to the partition start */ p_off = alignlba - dskp.start; - err = geli_read(&dskp, p_off * BD(dev).bd_sectorsize, tmpbuf, + err = geli_read(&dskp, p_off * BD(dev).bd_sectorsize, (u_char *)tmpbuf, alignblks * BD(dev).bd_sectorsize); if (err) return (err); if (tmpbuf != dest) { bcopy(tmpbuf + diff, dest, blks * BD(dev).bd_sectorsize); free(tmpbuf); } return (0); } #endif /* LOADER_GELI_SUPPORT */ return (bd_io(dev, dblk, blks, dest, 0)); } static int bd_write(struct disk_devdesc *dev, daddr_t dblk, int blks, caddr_t dest) { return (bd_io(dev, dblk, blks, dest, 1)); } /* * Return the BIOS geometry of a given "fixed drive" in a format * suitable for the legacy bootinfo structure. Since the kernel is * expecting raw int 0x13/0x8 values for N_BIOS_GEOM drives, we * prefer to get the information directly, rather than rely on being * able to put it together from information already maintained for * different purposes and for a probably different number of drives. * * For valid drives, the geometry is expected in the format (31..0) * "000000cc cccccccc hhhhhhhh 00ssssss"; and invalid drives are * indicated by returning the geometry of a "1.2M" PC-format floppy * disk. And, incidentally, what is returned is not the geometry as * such but the highest valid cylinder, head, and sector numbers. */ u_int32_t bd_getbigeom(int bunit) { v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x800; v86.edx = 0x80 + bunit; v86int(); if (V86_CY(v86.efl)) return 0x4f010f; return ((v86.ecx & 0xc0) << 18) | ((v86.ecx & 0xff00) << 8) | (v86.edx & 0xff00) | (v86.ecx & 0x3f); } /* * Return a suitable dev_t value for (dev). * * In the case where it looks like (dev) is a SCSI disk, we allow the number of * IDE disks to be specified in $num_ide_disks. There should be a Better Way. */ int bd_getdev(struct i386_devdesc *d) { struct disk_devdesc *dev; int biosdev; int major; int rootdev; char *nip, *cp; int i, unit; dev = (struct disk_devdesc *)d; biosdev = bd_unit2bios(dev->d_unit); DEBUG("unit %d BIOS device %d", dev->d_unit, biosdev); if (biosdev == -1) /* not a BIOS device */ return(-1); if (disk_open(dev, BD(dev).bd_sectors * BD(dev).bd_sectorsize, BD(dev).bd_sectorsize) != 0) /* oops, not a viable device */ return (-1); else disk_close(dev); if (biosdev < 0x80) { /* floppy (or emulated floppy) or ATAPI device */ if (bdinfo[dev->d_unit].bd_type == DT_ATAPI) { /* is an ATAPI disk */ major = WFDMAJOR; } else { /* is a floppy disk */ major = FDMAJOR; } } else { /* assume an IDE disk */ major = WDMAJOR; } /* default root disk unit number */ unit = biosdev & 0x7f; /* XXX a better kludge to set the root disk unit number */ if ((nip = getenv("root_disk_unit")) != NULL) { i = strtol(nip, &cp, 0); /* check for parse error */ if ((cp != nip) && (*cp == 0)) unit = i; } rootdev = MAKEBOOTDEV(major, dev->d_slice + 1, unit, dev->d_partition); DEBUG("dev is 0x%x\n", rootdev); return(rootdev); } #ifdef LOADER_GELI_SUPPORT int -bios_read(void *vdev __unused, struct dsk *priv, off_t off, char *buf, size_t bytes) +bios_read(void *vdev __unused, void *xpriv, off_t off, void *buf, size_t bytes) { struct disk_devdesc dev; + struct dsk *priv = xpriv; dev.d_dev = &biosdisk; dev.d_type = priv->type; dev.d_unit = priv->unit; dev.d_slice = priv->slice; dev.d_partition = priv->part; dev.d_offset = priv->start; off = off / BD(&dev).bd_sectorsize; /* GELI gives us the offset relative to the partition start */ off += dev.d_offset; bytes = bytes / BD(&dev).bd_sectorsize; return (bd_io(&dev, off, bytes, buf, 0)); } #endif /* LOADER_GELI_SUPPORT */ Index: head/stand/i386/zfsboot/zfsboot.c =================================================================== --- head/stand/i386/zfsboot/zfsboot.c (revision 326447) +++ head/stand/i386/zfsboot/zfsboot.c (revision 326448) @@ -1,1151 +1,1161 @@ /*- * Copyright (c) 1998 Robert Nordier * All rights reserved. * * Redistribution and use in source and binary forms are freely * permitted provided that the above copyright notice and this * paragraph and the following disclaimer are duplicated in all * such forms. * * This software is provided "AS IS" and without any express or * implied warranties, including, without limitation, the implied * warranties of merchantability and fitness for a particular * purpose. */ #include __FBSDID("$FreeBSD$"); #include #include #include #ifdef GPT #include #endif #include #include #include #include #include #include #include #include #include +/* Forward declared to avoid warnings -- these shouldn't be needed */ +int strcasecmp(const char *s1, const char *s2); +void explicit_bzero(void *b, size_t len); + #include "lib.h" #include "rbx.h" #include "drv.h" #include "edd.h" #include "util.h" #include "cons.h" #include "bootargs.h" #include "paths.h" #include "libzfs.h" #define ARGS 0x900 #define NOPT 14 #define NDEV 3 #define BIOS_NUMDRIVES 0x475 #define DRV_HARD 0x80 #define DRV_MASK 0x7f #define TYPE_AD 0 #define TYPE_DA 1 #define TYPE_MAXHARD TYPE_DA #define TYPE_FD 2 #define DEV_GELIBOOT_BSIZE 4096 extern uint32_t _end; #ifdef GPT static const uuid_t freebsd_zfs_uuid = GPT_ENT_TYPE_FREEBSD_ZFS; #endif static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */ static const unsigned char flags[NOPT] = { RBX_DUAL, RBX_SERIAL, RBX_ASKNAME, RBX_CDROM, RBX_CONFIG, RBX_KDB, RBX_GDB, RBX_MUTE, RBX_NOINTR, RBX_PAUSE, RBX_QUIET, RBX_DFLTROOT, RBX_SINGLE, RBX_VERBOSE }; uint32_t opts; static const unsigned char dev_maj[NDEV] = {30, 4, 2}; static char cmd[512]; static char cmddup[512]; static char kname[1024]; static char rootname[256]; static int comspeed = SIOSPD; static struct bootinfo bootinfo; static uint32_t bootdev; static struct zfs_boot_args zfsargs; -static struct zfsmount zfsmount; vm_offset_t high_heap_base; uint32_t bios_basemem, bios_extmem, high_heap_size; static struct bios_smap smap; /* * The minimum amount of memory to reserve in bios_extmem for the heap. */ #define HEAP_MIN (64 * 1024 * 1024) static char *heap_next; static char *heap_end; /* Buffers that must not span a 64k boundary. */ #define READ_BUF_SIZE 8192 struct dmadat { char rdbuf[READ_BUF_SIZE]; /* for reading large things */ char secbuf[READ_BUF_SIZE]; /* for MBR/disklabel */ }; static struct dmadat *dmadat; void exit(int); void reboot(void); static void load(void); static int parse_cmd(void); static void bios_getmem(void); void *malloc(size_t n); void free(void *ptr); int main(void); void * malloc(size_t n) { char *p = heap_next; if (p + n > heap_end) { printf("malloc failure\n"); for (;;) ; /* NOTREACHED */ return (0); } heap_next += n; return (p); } void free(void *ptr) { return; } static char * strdup(const char *s) { char *p = malloc(strlen(s) + 1); strcpy(p, s); return (p); } #ifdef LOADER_GELI_SUPPORT #include "geliboot.c" static char gelipw[GELI_PW_MAXLEN]; static struct keybuf *gelibuf; #endif #include "zfsimpl.c" /* * Read from a dnode (which must be from a ZPL filesystem). */ static int zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size) { const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus; size_t n; int rc; n = size; if (*offp + n > zp->zp_size) n = zp->zp_size - *offp; rc = dnode_read(spa, dnode, *offp, start, n); if (rc) return (-1); *offp += n; return (n); } /* * Current ZFS pool */ static spa_t *spa; static spa_t *primary_spa; static vdev_t *primary_vdev; /* * A wrapper for dskread that doesn't have to worry about whether the * buffer pointer crosses a 64k boundary. */ static int -vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) +vdev_read(void *xvdev, void *priv, off_t off, void *buf, size_t bytes) { char *p; daddr_t lba, alignlba; off_t diff; unsigned int nb, alignnb; struct dsk *dsk = (struct dsk *) priv; if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) return -1; p = buf; lba = off / DEV_BSIZE; lba += dsk->start; /* * Align reads to 4k else 4k sector GELIs will not decrypt. * Round LBA down to nearest multiple of DEV_GELIBOOT_BSIZE bytes. */ alignlba = rounddown2(off, DEV_GELIBOOT_BSIZE) / DEV_BSIZE; /* * The read must be aligned to DEV_GELIBOOT_BSIZE bytes relative to the * start of the GELI partition, not the start of the actual disk. */ alignlba += dsk->start; diff = (lba - alignlba) * DEV_BSIZE; while (bytes > 0) { nb = bytes / DEV_BSIZE; /* * Ensure that the read size plus the leading offset does not * exceed the size of the read buffer. */ if (nb > (READ_BUF_SIZE - diff) / DEV_BSIZE) nb = (READ_BUF_SIZE - diff) / DEV_BSIZE; /* * Round the number of blocks to read up to the nearest multiple * of DEV_GELIBOOT_BSIZE. */ alignnb = roundup2(nb * DEV_BSIZE + diff, DEV_GELIBOOT_BSIZE) / DEV_BSIZE; if (drvread(dsk, dmadat->rdbuf, alignlba, alignnb)) return -1; #ifdef LOADER_GELI_SUPPORT /* decrypt */ if (is_geli(dsk) == 0) { if (geli_read(dsk, ((alignlba - dsk->start) * DEV_BSIZE), dmadat->rdbuf, alignnb * DEV_BSIZE)) return (-1); } #endif memcpy(p, dmadat->rdbuf + diff, nb * DEV_BSIZE); p += nb * DEV_BSIZE; lba += nb; alignlba += alignnb; bytes -= nb * DEV_BSIZE; /* Don't need the leading offset after the first block. */ diff = 0; } return 0; } +/* Match the signature exactly due to signature madness */ +static int +vdev_read2(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) +{ + return vdev_read(vdev, priv, off, buf, bytes); +} + static int vdev_write(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) { char *p; daddr_t lba; unsigned int nb; struct dsk *dsk = (struct dsk *) priv; if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) return -1; p = buf; lba = off / DEV_BSIZE; lba += dsk->start; while (bytes > 0) { nb = bytes / DEV_BSIZE; if (nb > READ_BUF_SIZE / DEV_BSIZE) nb = READ_BUF_SIZE / DEV_BSIZE; memcpy(dmadat->rdbuf, p, nb * DEV_BSIZE); if (drvwrite(dsk, dmadat->rdbuf, lba, nb)) return -1; p += nb * DEV_BSIZE; lba += nb; bytes -= nb * DEV_BSIZE; } return 0; } static int xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte) { if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) { printf("Invalid format\n"); return -1; } return 0; } /* * Read Pad2 (formerly "Boot Block Header") area of the first * vdev label of the given vdev. */ static int vdev_read_pad2(vdev_t *vdev, char *buf, size_t size) { blkptr_t bp; char *tmp = zap_scratch; off_t off = offsetof(vdev_label_t, vl_pad2); if (size > VDEV_PAD_SIZE) size = VDEV_PAD_SIZE; BP_ZERO(&bp); BP_SET_LSIZE(&bp, VDEV_PAD_SIZE); BP_SET_PSIZE(&bp, VDEV_PAD_SIZE); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); DVA_SET_OFFSET(BP_IDENTITY(&bp), off); if (vdev_read_phys(vdev, &bp, tmp, off, 0)) return (EIO); memcpy(buf, tmp, size); return (0); } static int vdev_clear_pad2(vdev_t *vdev) { char *zeroes = zap_scratch; uint64_t *end; off_t off = offsetof(vdev_label_t, vl_pad2); memset(zeroes, 0, VDEV_PAD_SIZE); end = (uint64_t *)(zeroes + VDEV_PAD_SIZE); /* ZIO_CHECKSUM_LABEL magic and pre-calcualted checksum for all zeros */ end[-5] = 0x0210da7ab10c7a11; end[-4] = 0x97f48f807f6e2a3f; end[-3] = 0xaf909f1658aacefc; end[-2] = 0xcbd1ea57ff6db48b; end[-1] = 0x6ec692db0d465fab; if (vdev_write(vdev, vdev->v_read_priv, off, zeroes, VDEV_PAD_SIZE)) return (EIO); return (0); } static void bios_getmem(void) { uint64_t size; /* Parse system memory map */ v86.ebx = 0; do { v86.ctl = V86_FLAGS; v86.addr = 0x15; /* int 0x15 function 0xe820*/ v86.eax = 0xe820; v86.ecx = sizeof(struct bios_smap); v86.edx = SMAP_SIG; v86.es = VTOPSEG(&smap); v86.edi = VTOPOFF(&smap); v86int(); if (V86_CY(v86.efl) || (v86.eax != SMAP_SIG)) break; /* look for a low-memory segment that's large enough */ if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0) && (smap.length >= (512 * 1024))) bios_basemem = smap.length; /* look for the first segment in 'extended' memory */ if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0x100000)) { bios_extmem = smap.length; } /* * Look for the largest segment in 'extended' memory beyond * 1MB but below 4GB. */ if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base > 0x100000) && (smap.base < 0x100000000ull)) { size = smap.length; /* * If this segment crosses the 4GB boundary, truncate it. */ if (smap.base + size > 0x100000000ull) size = 0x100000000ull - smap.base; if (size > high_heap_size) { high_heap_size = size; high_heap_base = smap.base; } } } while (v86.ebx != 0); /* Fall back to the old compatibility function for base memory */ if (bios_basemem == 0) { v86.ctl = 0; v86.addr = 0x12; /* int 0x12 */ v86int(); bios_basemem = (v86.eax & 0xffff) * 1024; } /* Fall back through several compatibility functions for extended memory */ if (bios_extmem == 0) { v86.ctl = V86_FLAGS; v86.addr = 0x15; /* int 0x15 function 0xe801*/ v86.eax = 0xe801; v86int(); if (!V86_CY(v86.efl)) { bios_extmem = ((v86.ecx & 0xffff) + ((v86.edx & 0xffff) * 64)) * 1024; } } if (bios_extmem == 0) { v86.ctl = 0; v86.addr = 0x15; /* int 0x15 function 0x88*/ v86.eax = 0x8800; v86int(); bios_extmem = (v86.eax & 0xffff) * 1024; } /* * If we have extended memory and did not find a suitable heap * region in the SMAP, use the last 3MB of 'extended' memory as a * high heap candidate. */ if (bios_extmem >= HEAP_MIN && high_heap_size < HEAP_MIN) { high_heap_size = HEAP_MIN; high_heap_base = bios_extmem + 0x100000 - HEAP_MIN; } } /* * Try to detect a device supported by the legacy int13 BIOS */ static int int13probe(int drive) { v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x800; v86.edx = drive; v86int(); if (!V86_CY(v86.efl) && /* carry clear */ ((v86.edx & 0xff) != (drive & DRV_MASK))) { /* unit # OK */ if ((v86.ecx & 0x3f) == 0) { /* absurd sector size */ return(0); /* skip device */ } return (1); } return(0); } /* * We call this when we find a ZFS vdev - ZFS consumes the dsk * structure so we must make a new one. */ static struct dsk * copy_dsk(struct dsk *dsk) { struct dsk *newdsk; newdsk = malloc(sizeof(struct dsk)); *newdsk = *dsk; return (newdsk); } /* * Get disk size from eax=0x800 and 0x4800. We need to probe both * because 0x4800 may not be available and we would like to get more * or less correct disk size - if it is possible at all. * Note we do not really want to touch drv.c because that code is shared * with boot2 and we can not afford to grow that code. */ static uint64_t drvsize_ext(struct dsk *dskp) { uint64_t size, tmp; int cyl, hds, sec; v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x800; v86.edx = dskp->drive; v86int(); /* Don't error out if we get bad sector number, try EDD as well */ if (V86_CY(v86.efl) || /* carry set */ (v86.edx & 0xff) <= (unsigned)(dskp->drive & 0x7f)) /* unit # bad */ return (0); cyl = ((v86.ecx & 0xc0) << 2) + ((v86.ecx & 0xff00) >> 8) + 1; /* Convert max head # -> # of heads */ hds = ((v86.edx & 0xff00) >> 8) + 1; sec = v86.ecx & 0x3f; size = (uint64_t)cyl * hds * sec; /* Determine if we can use EDD with this device. */ v86.ctl = V86_FLAGS; v86.addr = 0x13; v86.eax = 0x4100; v86.edx = dskp->drive; v86.ebx = 0x55aa; v86int(); if (V86_CY(v86.efl) || /* carry set */ (v86.ebx & 0xffff) != 0xaa55 || /* signature */ (v86.ecx & EDD_INTERFACE_FIXED_DISK) == 0) return (size); tmp = drvsize(dskp); if (tmp > size) size = tmp; return (size); } /* * The "layered" ioctl to read disk/partition size. Unfortunately * the zfsboot case is hardest, because we do not have full software * stack available, so we need to do some manual work here. */ uint64_t ldi_get_size(void *priv) { struct dsk *dskp = priv; uint64_t size = dskp->size; if (dskp->start == 0) size = drvsize_ext(dskp); return (size * DEV_BSIZE); } static void probe_drive(struct dsk *dsk) { #ifdef GPT struct gpt_hdr hdr; struct gpt_ent *ent; unsigned part, entries_per_sec; daddr_t slba; #endif #if defined(GPT) || defined(LOADER_GELI_SUPPORT) daddr_t elba; #endif struct dos_partition *dp; char *sec; unsigned i; /* * If we find a vdev on the whole disk, stop here. */ - if (vdev_probe(vdev_read, dsk, NULL) == 0) + if (vdev_probe(vdev_read2, dsk, NULL) == 0) return; #ifdef LOADER_GELI_SUPPORT /* * Taste the disk, if it is GELI encrypted, decrypt it and check to see if * it is a usable vdev then. Otherwise dig * out the partition table and probe each slice/partition * in turn for a vdev or GELI encrypted vdev. */ elba = drvsize_ext(dsk); if (elba > 0) { elba--; } if (geli_taste(vdev_read, dsk, elba) == 0) { - if (geli_havekey(dsk) == 0 || geli_passphrase(&gelipw, dsk->unit, + if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit, ':', 0, dsk) == 0) { - if (vdev_probe(vdev_read, dsk, NULL) == 0) { + if (vdev_probe(vdev_read2, dsk, NULL) == 0) { return; } } } #endif /* LOADER_GELI_SUPPORT */ sec = dmadat->secbuf; dsk->start = 0; #ifdef GPT /* * First check for GPT. */ if (drvread(dsk, sec, 1, 1)) { return; } memcpy(&hdr, sec, sizeof(hdr)); if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0 || hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000 || hdr.hdr_entsz < sizeof(*ent) || DEV_BSIZE % hdr.hdr_entsz != 0) { goto trymbr; } /* * Probe all GPT partitions for the presence of ZFS pools. We * return the spa_t for the first we find (if requested). This * will have the effect of booting from the first pool on the * disk. * * If no vdev is found, GELI decrypting the device and try again */ entries_per_sec = DEV_BSIZE / hdr.hdr_entsz; slba = hdr.hdr_lba_table; elba = slba + hdr.hdr_entries / entries_per_sec; while (slba < elba) { dsk->start = 0; if (drvread(dsk, sec, slba, 1)) return; for (part = 0; part < entries_per_sec; part++) { ent = (struct gpt_ent *)(sec + part * hdr.hdr_entsz); if (memcmp(&ent->ent_type, &freebsd_zfs_uuid, sizeof(uuid_t)) == 0) { dsk->start = ent->ent_lba_start; dsk->size = ent->ent_lba_end - ent->ent_lba_start + 1; dsk->slice = part + 1; dsk->part = 255; - if (vdev_probe(vdev_read, dsk, NULL) == 0) { + if (vdev_probe(vdev_read2, dsk, NULL) == 0) { /* * This slice had a vdev. We need a new dsk * structure now since the vdev now owns this one. */ dsk = copy_dsk(dsk); } #ifdef LOADER_GELI_SUPPORT else if (geli_taste(vdev_read, dsk, ent->ent_lba_end - ent->ent_lba_start) == 0) { - if (geli_havekey(dsk) == 0 || geli_passphrase(&gelipw, + if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit, 'p', dsk->slice, dsk) == 0) { /* * This slice has GELI, check it for ZFS. */ - if (vdev_probe(vdev_read, dsk, NULL) == 0) { + if (vdev_probe(vdev_read2, dsk, NULL) == 0) { /* * This slice had a vdev. We need a new dsk * structure now since the vdev now owns this one. */ dsk = copy_dsk(dsk); } break; } } #endif /* LOADER_GELI_SUPPORT */ } } slba++; } return; trymbr: #endif /* GPT */ if (drvread(dsk, sec, DOSBBSECTOR, 1)) return; dp = (void *)(sec + DOSPARTOFF); for (i = 0; i < NDOSPART; i++) { if (!dp[i].dp_typ) continue; dsk->start = dp[i].dp_start; dsk->size = dp[i].dp_size; dsk->slice = i + 1; - if (vdev_probe(vdev_read, dsk, NULL) == 0) { + if (vdev_probe(vdev_read2, dsk, NULL) == 0) { dsk = copy_dsk(dsk); } #ifdef LOADER_GELI_SUPPORT else if (geli_taste(vdev_read, dsk, dp[i].dp_size - dp[i].dp_start) == 0) { - if (geli_havekey(dsk) == 0 || geli_passphrase(&gelipw, dsk->unit, + if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit, 's', i, dsk) == 0) { /* * This slice has GELI, check it for ZFS. */ - if (vdev_probe(vdev_read, dsk, NULL) == 0) { + if (vdev_probe(vdev_read2, dsk, NULL) == 0) { /* * This slice had a vdev. We need a new dsk * structure now since the vdev now owns this one. */ dsk = copy_dsk(dsk); } break; } } #endif /* LOADER_GELI_SUPPORT */ } } int main(void) { dnode_phys_t dn; off_t off; struct dsk *dsk; int autoboot, i; int nextboot; int rc; dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base); bios_getmem(); if (high_heap_size > 0) { heap_end = PTOV(high_heap_base + high_heap_size); heap_next = PTOV(high_heap_base); } else { heap_next = (char *)dmadat + sizeof(*dmadat); heap_end = (char *)PTOV(bios_basemem); } dsk = malloc(sizeof(struct dsk)); dsk->drive = *(uint8_t *)PTOV(ARGS); dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD; dsk->unit = dsk->drive & DRV_MASK; dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1; dsk->part = 0; dsk->start = 0; dsk->size = 0; bootinfo.bi_version = BOOTINFO_VERSION; bootinfo.bi_size = sizeof(bootinfo); bootinfo.bi_basemem = bios_basemem / 1024; bootinfo.bi_extmem = bios_extmem / 1024; bootinfo.bi_memsizes_valid++; bootinfo.bi_bios_dev = dsk->drive; bootdev = MAKEBOOTDEV(dev_maj[dsk->type], dsk->slice, dsk->unit, dsk->part); /* Process configuration file */ autoboot = 1; #ifdef LOADER_GELI_SUPPORT geli_init(); #endif zfs_init(); /* * Probe the boot drive first - we will try to boot from whatever * pool we find on that drive. */ probe_drive(dsk); /* * Probe the rest of the drives that the bios knows about. This * will find any other available pools and it may fill in missing * vdevs for the boot pool. */ #ifndef VIRTUALBOX for (i = 0; i < *(unsigned char *)PTOV(BIOS_NUMDRIVES); i++) #else for (i = 0; i < MAXBDDEV; i++) #endif { if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS)) continue; if (!int13probe(i | DRV_HARD)) break; dsk = malloc(sizeof(struct dsk)); dsk->drive = i | DRV_HARD; dsk->type = dsk->drive & TYPE_AD; dsk->unit = i; dsk->slice = 0; dsk->part = 0; dsk->start = 0; dsk->size = 0; probe_drive(dsk); } /* * The first discovered pool, if any, is the pool. */ spa = spa_get_primary(); if (!spa) { printf("%s: No ZFS pools located, can't boot\n", BOOTPROG); for (;;) ; } primary_spa = spa; primary_vdev = spa_get_primary_vdev(spa); nextboot = 0; rc = vdev_read_pad2(primary_vdev, cmd, sizeof(cmd)); if (vdev_clear_pad2(primary_vdev)) printf("failed to clear pad2 area of primary vdev\n"); if (rc == 0) { if (*cmd) { /* * We could find an old-style ZFS Boot Block header here. * Simply ignore it. */ if (*(uint64_t *)cmd != 0x2f5b007b10c) { /* * Note that parse() is destructive to cmd[] and we also want * to honor RBX_QUIET option that could be present in cmd[]. */ nextboot = 1; memcpy(cmddup, cmd, sizeof(cmd)); if (parse_cmd()) { printf("failed to parse pad2 area of primary vdev\n"); reboot(); } if (!OPT_CHECK(RBX_QUIET)) printf("zfs nextboot: %s\n", cmddup); } /* Do not process this command twice */ *cmd = 0; } } else printf("failed to read pad2 area of primary vdev\n"); /* Mount ZFS only if it's not already mounted via nextboot parsing. */ if (zfsmount.spa == NULL && (zfs_spa_init(spa) != 0 || zfs_mount(spa, 0, &zfsmount) != 0)) { printf("%s: failed to mount default pool %s\n", BOOTPROG, spa->spa_name); autoboot = 0; } else if (zfs_lookup(&zfsmount, PATH_CONFIG, &dn) == 0 || zfs_lookup(&zfsmount, PATH_DOTCONFIG, &dn) == 0) { off = 0; zfs_read(spa, &dn, &off, cmd, sizeof(cmd)); } if (*cmd) { /* * Note that parse_cmd() is destructive to cmd[] and we also want * to honor RBX_QUIET option that could be present in cmd[]. */ memcpy(cmddup, cmd, sizeof(cmd)); if (parse_cmd()) autoboot = 0; if (!OPT_CHECK(RBX_QUIET)) printf("%s: %s\n", PATH_CONFIG, cmddup); /* Do not process this command twice */ *cmd = 0; } /* Do not risk waiting at the prompt forever. */ if (nextboot && !autoboot) reboot(); /* * Try to exec /boot/loader. If interrupted by a keypress, * or in case of failure, try to load a kernel directly instead. */ if (autoboot && !*kname) { memcpy(kname, PATH_LOADER_ZFS, sizeof(PATH_LOADER_ZFS)); if (!keyhit(3)) { load(); memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL)); } } /* Present the user with the boot2 prompt. */ for (;;) { if (!autoboot || !OPT_CHECK(RBX_QUIET)) { printf("\nFreeBSD/x86 boot\n"); if (zfs_rlookup(spa, zfsmount.rootobj, rootname) != 0) printf("Default: %s/<0x%llx>:%s\n" "boot: ", spa->spa_name, zfsmount.rootobj, kname); else if (rootname[0] != '\0') printf("Default: %s/%s:%s\n" "boot: ", spa->spa_name, rootname, kname); else printf("Default: %s:%s\n" "boot: ", spa->spa_name, kname); } if (ioctrl & IO_SERIAL) sio_flush(); if (!autoboot || keyhit(5)) getstr(cmd, sizeof(cmd)); else if (!autoboot || !OPT_CHECK(RBX_QUIET)) putchar('\n'); autoboot = 0; if (parse_cmd()) putchar('\a'); else load(); } } /* XXX - Needed for btxld to link the boot2 binary; do not remove. */ void exit(int x) { __exit(x); } void reboot(void) { __exit(0); } static void load(void) { union { struct exec ex; Elf32_Ehdr eh; } hdr; static Elf32_Phdr ep[2]; static Elf32_Shdr es[2]; caddr_t p; dnode_phys_t dn; off_t off; uint32_t addr, x; int fmt, i, j; if (zfs_lookup(&zfsmount, kname, &dn)) { printf("\nCan't find %s\n", kname); return; } off = 0; if (xfsread(&dn, &off, &hdr, sizeof(hdr))) return; if (N_GETMAGIC(hdr.ex) == ZMAGIC) fmt = 0; else if (IS_ELF(hdr.eh)) fmt = 1; else { printf("Invalid %s\n", "format"); return; } if (fmt == 0) { addr = hdr.ex.a_entry & 0xffffff; p = PTOV(addr); off = PAGE_SIZE; if (xfsread(&dn, &off, p, hdr.ex.a_text)) return; p += roundup2(hdr.ex.a_text, PAGE_SIZE); if (xfsread(&dn, &off, p, hdr.ex.a_data)) return; p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms)); p += sizeof(hdr.ex.a_syms); if (hdr.ex.a_syms) { if (xfsread(&dn, &off, p, hdr.ex.a_syms)) return; p += hdr.ex.a_syms; if (xfsread(&dn, &off, p, sizeof(int))) return; x = *(uint32_t *)p; p += sizeof(int); x -= sizeof(int); if (xfsread(&dn, &off, p, x)) return; p += x; } } else { off = hdr.eh.e_phoff; for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) { if (xfsread(&dn, &off, ep + j, sizeof(ep[0]))) return; if (ep[j].p_type == PT_LOAD) j++; } for (i = 0; i < 2; i++) { p = PTOV(ep[i].p_paddr & 0xffffff); off = ep[i].p_offset; if (xfsread(&dn, &off, p, ep[i].p_filesz)) return; } p += roundup2(ep[1].p_memsz, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) { off = hdr.eh.e_shoff + sizeof(es[0]) * (hdr.eh.e_shstrndx + 1); if (xfsread(&dn, &off, &es, sizeof(es))) return; for (i = 0; i < 2; i++) { memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size)); p += sizeof(es[i].sh_size); off = es[i].sh_offset; if (xfsread(&dn, &off, p, es[i].sh_size)) return; p += es[i].sh_size; } } addr = hdr.eh.e_entry & 0xffffff; } bootinfo.bi_esymtab = VTOP(p); bootinfo.bi_kernelname = VTOP(kname); zfsargs.size = sizeof(zfsargs); zfsargs.pool = zfsmount.spa->spa_guid; zfsargs.root = zfsmount.rootobj; zfsargs.primary_pool = primary_spa->spa_guid; #ifdef LOADER_GELI_SUPPORT explicit_bzero(gelipw, sizeof(gelipw)); gelibuf = malloc(sizeof(struct keybuf) + (GELI_MAX_KEYS * sizeof(struct keybuf_ent))); geli_fill_keybuf(gelibuf); zfsargs.notapw = '\0'; zfsargs.keybuf_sentinel = KEYBUF_SENTINEL; zfsargs.keybuf = gelibuf; #else zfsargs.gelipw[0] = '\0'; #endif if (primary_vdev != NULL) zfsargs.primary_vdev = primary_vdev->v_guid; else printf("failed to detect primary vdev\n"); __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), bootdev, KARGS_FLAGS_ZFS | KARGS_FLAGS_EXTARG, (uint32_t) spa->spa_guid, (uint32_t) (spa->spa_guid >> 32), VTOP(&bootinfo), zfsargs); } static int zfs_mount_ds(char *dsname) { uint64_t newroot; spa_t *newspa; char *q; q = strchr(dsname, '/'); if (q) *q++ = '\0'; newspa = spa_find_by_name(dsname); if (newspa == NULL) { printf("\nCan't find ZFS pool %s\n", dsname); return -1; } if (zfs_spa_init(newspa)) return -1; newroot = 0; if (q) { if (zfs_lookup_dataset(newspa, q, &newroot)) { printf("\nCan't find dataset %s in ZFS pool %s\n", q, newspa->spa_name); return -1; } } if (zfs_mount(newspa, newroot, &zfsmount)) { printf("\nCan't mount ZFS dataset\n"); return -1; } spa = newspa; return (0); } static int parse_cmd(void) { char *arg = cmd; char *ep, *p, *q; const char *cp; int c, i, j; while ((c = *arg++)) { if (c == ' ' || c == '\t' || c == '\n') continue; for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++); ep = p; if (*p) *p++ = 0; if (c == '-') { while ((c = *arg++)) { if (c == 'P') { if (*(uint8_t *)PTOV(0x496) & 0x10) { cp = "yes"; } else { opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL); cp = "no"; } printf("Keyboard: %s\n", cp); continue; } else if (c == 'S') { j = 0; while ((unsigned int)(i = *arg++ - '0') <= 9) j = j * 10 + i; if (j > 0 && i == -'0') { comspeed = j; break; } /* Fall through to error below ('S' not in optstr[]). */ } for (i = 0; c != optstr[i]; i++) if (i == NOPT - 1) return -1; opts ^= OPT_SET(flags[i]); } ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) : OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD; if (ioctrl & IO_SERIAL) { if (sio_init(115200 / comspeed) != 0) ioctrl &= ~IO_SERIAL; } } if (c == '?') { dnode_phys_t dn; if (zfs_lookup(&zfsmount, arg, &dn) == 0) { zap_list(spa, &dn); } return -1; } else { arg--; /* * Report pool status if the comment is 'status'. Lets * hope no-one wants to load /status as a kernel. */ if (!strcmp(arg, "status")) { spa_all_status(); return -1; } /* * If there is "zfs:" prefix simply ignore it. */ if (strncmp(arg, "zfs:", 4) == 0) arg += 4; /* * If there is a colon, switch pools. */ q = strchr(arg, ':'); if (q) { *q++ = '\0'; if (zfs_mount_ds(arg) != 0) return -1; arg = q; } if ((i = ep - arg)) { if ((size_t)i >= sizeof(kname)) return -1; memcpy(kname, arg, i + 1); } } arg = p; } return 0; } Index: head/stand/libsa/Makefile =================================================================== --- head/stand/libsa/Makefile (revision 326447) +++ head/stand/libsa/Makefile (revision 326448) @@ -1,157 +1,156 @@ # $FreeBSD$ # Originally from $NetBSD: Makefile,v 1.21 1997/10/26 22:08:38 lukem Exp $ # # Notes: # - We don't use the libc strerror/sys_errlist because the string table is # quite large. # MK_PROFILE= no MK_SSP= no .include INTERNALLIB= LIBSA_CPUARCH?=${MACHINE_CPUARCH} LIBC_SRC= ${SRCTOP}/lib/libc LIB?= sa NO_PIC= -WARNS?= 0 # standalone components and stuff we have modified locally SRCS+= gzguts.h zutil.h __main.c assert.c bcd.c environment.c getopt.c gets.c \ globals.c pager.c panic.c printf.c strdup.c strerror.c strtol.c strtoul.c \ random.c sbrk.c twiddle.c zalloc.c zalloc_malloc.c # private (pruned) versions of libc string functions SRCS+= strcasecmp.c .PATH: ${LIBC_SRC}/net SRCS+= ntoh.c # string functions from libc .PATH: ${LIBC_SRC}/string SRCS+= bcmp.c bcopy.c bzero.c ffs.c fls.c \ memccpy.c memchr.c memcmp.c memcpy.c memmove.c memset.c \ qdivrem.c strcat.c strchr.c strcmp.c strcpy.c stpcpy.c stpncpy.c \ strcspn.c strlcat.c strlcpy.c strlen.c strncat.c strncmp.c strncpy.c \ strpbrk.c strrchr.c strsep.c strspn.c strstr.c strtok.c swab.c .if ${MACHINE_CPUARCH} == "arm" .PATH: ${LIBC_SRC}/arm/gen # Do not generate movt/movw, because the relocation fixup for them does not # translate to the -Bsymbolic -pie format required by self_reloc() in loader(8). # Also, the fpu is not available in a standalone environment. .if ${COMPILER_VERSION} < 30800 CFLAGS.clang+= -mllvm -arm-use-movt=0 .else CFLAGS.clang+= -mno-movt .endif CFLAGS.clang+= -mfpu=none # Compiler support functions .PATH: ${SRCTOP}/contrib/compiler-rt/lib/builtins/ # __clzsi2 and ctzsi2 for various builtin functions SRCS+= clzsi2.c ctzsi2.c # Divide and modulus functions called by the compiler SRCS+= divmoddi4.c divmodsi4.c divdi3.c divsi3.c moddi3.c modsi3.c SRCS+= udivmoddi4.c udivmodsi4.c udivdi3.c udivsi3.c umoddi3.c umodsi3.c .PATH: ${SRCTOP}/contrib/compiler-rt/lib/builtins/arm/ SRCS+= aeabi_idivmod.S aeabi_ldivmod.S aeabi_uidivmod.S aeabi_uldivmod.S SRCS+= aeabi_memcmp.S aeabi_memcpy.S aeabi_memmove.S aeabi_memset.S .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "riscv" .PATH: ${LIBC_SRC}/${MACHINE_CPUARCH}/gen .endif .if ${MACHINE_CPUARCH} == "powerpc" .PATH: ${LIBC_SRC}/quad SRCS+= ashldi3.c ashrdi3.c SRCS+= syncicache.c .endif # uuid functions from libc .PATH: ${LIBC_SRC}/uuid SRCS+= uuid_create_nil.c uuid_equal.c uuid_from_string.c uuid_is_nil.c uuid_to_string.c # _setjmp/_longjmp .PATH: ${SASRC}/${LIBSA_CPUARCH} SRCS+= _setjmp.S # decompression functionality from libbz2 # NOTE: to actually test this functionality after libbz2 upgrade compile # loader(8) with LOADER_BZIP2_SUPPORT defined .PATH: ${SRCTOP}/contrib/bzip2 CFLAGS+= -DBZ_NO_STDIO -DBZ_NO_COMPRESS SRCS+= libsa_bzlib_private.h .for file in bzlib.c crctable.c decompress.c huffman.c randtable.c SRCS+= _${file} CLEANFILES+= _${file} _${file}: ${file} sed "s|bzlib_private\.h|libsa_bzlib_private.h|" \ ${.ALLSRC} > ${.TARGET} .endfor CLEANFILES+= libsa_bzlib_private.h libsa_bzlib_private.h: bzlib_private.h sed -e 's||"stand.h"|' \ ${.ALLSRC} > ${.TARGET} # decompression functionality from zlib .PATH: ${SRCTOP}/contrib/zlib CFLAGS+=-DHAVE_MEMCPY -I${SRCTOP}/contrib/zlib SRCS+= adler32.c crc32.c libsa_zutil.h libsa_gzguts.h .for file in infback.c inffast.c inflate.c inftrees.c zutil.c SRCS+= _${file} CLEANFILES+= _${file} _${file}: ${file} sed -e "s|zutil\.h|libsa_zutil.h|" \ -e "s|gzguts\.h|libsa_gzguts.h|" \ ${.ALLSRC} > ${.TARGET} .endfor # depend on stand.h being able to be included multiple times .for file in zutil.h gzguts.h CLEANFILES+= libsa_${file} libsa_${file}: ${file} sed -e 's||"stand.h"|' \ -e 's||"stand.h"|' \ -e 's||"stand.h"|' \ -e 's||"stand.h"|' \ -e 's||"stand.h"|' \ ${.ALLSRC} > ${.TARGET} .endfor # io routines SRCS+= closeall.c dev.c ioctl.c nullfs.c stat.c \ fstat.c close.c lseek.c open.c read.c write.c readdir.c # network routines SRCS+= arp.c ether.c ip.c inet_ntoa.c in_cksum.c net.c udp.c netif.c rpc.c # network info services: SRCS+= bootp.c rarp.c bootparam.c # boot filesystems SRCS+= ufs.c nfs.c cd9660.c tftp.c gzipfs.c bzipfs.c SRCS+= dosfs.c ext2fs.c SRCS+= splitfs.c SRCS+= pkgfs.c .if ${MK_NAND} != "no" SRCS+= nandfs.c .endif # explicit_bzero .PATH: ${SYSDIR}/libkern SRCS+= explicit_bzero.c .include .include Index: head/stand/zfs/zfsimpl.c =================================================================== --- head/stand/zfs/zfsimpl.c (revision 326447) +++ head/stand/zfs/zfsimpl.c (revision 326448) @@ -1,2536 +1,2537 @@ /*- * Copyright (c) 2007 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Stand-alone ZFS file reader. */ #include #include #include "zfsimpl.h" #include "zfssubr.c" struct zfsmount { const spa_t *spa; objset_phys_t objset; uint64_t rootobj; }; +static struct zfsmount zfsmount; /* * List of all vdevs, chained through v_alllink. */ static vdev_list_t zfs_vdevs; /* * List of ZFS features supported for read */ static const char *features_for_read[] = { "org.illumos:lz4_compress", "com.delphix:hole_birth", "com.delphix:extensible_dataset", "com.delphix:embedded_data", "org.open-zfs:large_blocks", "org.illumos:sha512", "org.illumos:skein", "org.zfsonlinux:large_dnode", NULL }; /* * List of all pools, chained through spa_link. */ static spa_list_t zfs_pools; static const dnode_phys_t *dnode_cache_obj; static uint64_t dnode_cache_bn; static char *dnode_cache_buf; static char *zap_scratch; static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr; #define TEMP_SIZE (1024 * 1024) static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf); static int zfs_get_root(const spa_t *spa, uint64_t *objid); static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result); static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t integer_size, uint64_t num_integers, void *value); static void zfs_init(void) { STAILQ_INIT(&zfs_vdevs); STAILQ_INIT(&zfs_pools); zfs_temp_buf = malloc(TEMP_SIZE); zfs_temp_end = zfs_temp_buf + TEMP_SIZE; zfs_temp_ptr = zfs_temp_buf; dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE); zap_scratch = malloc(SPA_MAXBLOCKSIZE); zfs_init_crc(); } static void * zfs_alloc(size_t size) { char *ptr; if (zfs_temp_ptr + size > zfs_temp_end) { printf("ZFS: out of temporary buffer space\n"); for (;;) ; } ptr = zfs_temp_ptr; zfs_temp_ptr += size; return (ptr); } static void zfs_free(void *ptr, size_t size) { zfs_temp_ptr -= size; if (zfs_temp_ptr != ptr) { printf("ZFS: zfs_alloc()/zfs_free() mismatch\n"); for (;;) ; } } static int xdr_int(const unsigned char **xdr, int *ip) { *ip = ((*xdr)[0] << 24) | ((*xdr)[1] << 16) | ((*xdr)[2] << 8) | ((*xdr)[3] << 0); (*xdr) += 4; return (0); } static int xdr_u_int(const unsigned char **xdr, u_int *ip) { *ip = ((*xdr)[0] << 24) | ((*xdr)[1] << 16) | ((*xdr)[2] << 8) | ((*xdr)[3] << 0); (*xdr) += 4; return (0); } static int xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) { u_int hi, lo; xdr_u_int(xdr, &hi); xdr_u_int(xdr, &lo); *lp = (((uint64_t) hi) << 32) | lo; return (0); } static int nvlist_find(const unsigned char *nvlist, const char *name, int type, int* elementsp, void *valuep) { const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { int namelen, pairtype, elements; const char *pairname; xdr_int(&p, &namelen); pairname = (const char*) p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); if (!memcmp(name, pairname, namelen) && type == pairtype) { xdr_int(&p, &elements); if (elementsp) *elementsp = elements; if (type == DATA_TYPE_UINT64) { xdr_uint64_t(&p, (uint64_t *) valuep); return (0); } else if (type == DATA_TYPE_STRING) { int len; xdr_int(&p, &len); (*(const char**) valuep) = (const char*) p; return (0); } else if (type == DATA_TYPE_NVLIST || type == DATA_TYPE_NVLIST_ARRAY) { (*(const unsigned char**) valuep) = (const unsigned char*) p; return (0); } else { return (EIO); } } else { /* * Not the pair we are looking for, skip to the next one. */ p = pair + encoded_size; } pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return (EIO); } static int nvlist_check_features_for_read(const unsigned char *nvlist) { const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; int rc; rc = 0; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { int namelen, pairtype; const char *pairname; int i, found; found = 0; xdr_int(&p, &namelen); pairname = (const char*) p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); for (i = 0; features_for_read[i] != NULL; i++) { if (!memcmp(pairname, features_for_read[i], namelen)) { found = 1; break; } } if (!found) { printf("ZFS: unsupported feature: %s\n", pairname); rc = EIO; } p = pair + encoded_size; pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return (rc); } /* * Return the next nvlist in an nvlist array. */ static const unsigned char * nvlist_next(const unsigned char *nvlist) { const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { p = pair + encoded_size; pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return p; } #ifdef TEST static const unsigned char * nvlist_print(const unsigned char *nvlist, unsigned int indent) { static const char* typenames[] = { "DATA_TYPE_UNKNOWN", "DATA_TYPE_BOOLEAN", "DATA_TYPE_BYTE", "DATA_TYPE_INT16", "DATA_TYPE_UINT16", "DATA_TYPE_INT32", "DATA_TYPE_UINT32", "DATA_TYPE_INT64", "DATA_TYPE_UINT64", "DATA_TYPE_STRING", "DATA_TYPE_BYTE_ARRAY", "DATA_TYPE_INT16_ARRAY", "DATA_TYPE_UINT16_ARRAY", "DATA_TYPE_INT32_ARRAY", "DATA_TYPE_UINT32_ARRAY", "DATA_TYPE_INT64_ARRAY", "DATA_TYPE_UINT64_ARRAY", "DATA_TYPE_STRING_ARRAY", "DATA_TYPE_HRTIME", "DATA_TYPE_NVLIST", "DATA_TYPE_NVLIST_ARRAY", "DATA_TYPE_BOOLEAN_VALUE", "DATA_TYPE_INT8", "DATA_TYPE_UINT8", "DATA_TYPE_BOOLEAN_ARRAY", "DATA_TYPE_INT8_ARRAY", "DATA_TYPE_UINT8_ARRAY" }; unsigned int i, j; const unsigned char *p, *pair; int junk; int encoded_size, decoded_size; p = nvlist; xdr_int(&p, &junk); xdr_int(&p, &junk); pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); while (encoded_size && decoded_size) { int namelen, pairtype, elements; const char *pairname; xdr_int(&p, &namelen); pairname = (const char*) p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); for (i = 0; i < indent; i++) printf(" "); printf("%s %s", typenames[pairtype], pairname); xdr_int(&p, &elements); switch (pairtype) { case DATA_TYPE_UINT64: { uint64_t val; xdr_uint64_t(&p, &val); printf(" = 0x%jx\n", (uintmax_t)val); break; } case DATA_TYPE_STRING: { int len; xdr_int(&p, &len); printf(" = \"%s\"\n", p); break; } case DATA_TYPE_NVLIST: printf("\n"); nvlist_print(p, indent + 1); break; case DATA_TYPE_NVLIST_ARRAY: for (j = 0; j < elements; j++) { printf("[%d]\n", j); p = nvlist_print(p, indent + 1); if (j != elements - 1) { for (i = 0; i < indent; i++) printf(" "); printf("%s %s", typenames[pairtype], pairname); } } break; default: printf("\n"); } p = pair + encoded_size; pair = p; xdr_int(&p, &encoded_size); xdr_int(&p, &decoded_size); } return p; } #endif static int vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t size) { size_t psize; int rc; if (!vdev->v_phys_read) return (EIO); if (bp) { psize = BP_GET_PSIZE(bp); } else { psize = size; } /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/ rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); if (rc) return (rc); if (bp && zio_checksum_verify(vdev->spa, bp, buf)) return (EIO); return (0); } static int vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { return (vdev_read_phys(vdev, bp, buf, offset + VDEV_LABEL_START_SIZE, bytes)); } static int vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { vdev_t *kid; int rc; rc = EIO; STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { if (kid->v_state != VDEV_STATE_HEALTHY) continue; rc = kid->v_read(kid, bp, buf, offset, bytes); if (!rc) return (0); } return (rc); } static int vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { vdev_t *kid; /* * Here we should have two kids: * First one which is the one we are replacing and we can trust * only this one to have valid data, but it might not be present. * Second one is that one we are replacing with. It is most likely * healthy, but we can't trust it has needed data, so we won't use it. */ kid = STAILQ_FIRST(&vdev->v_children); if (kid == NULL) return (EIO); if (kid->v_state != VDEV_STATE_HEALTHY) return (EIO); return (kid->v_read(kid, bp, buf, offset, bytes)); } static vdev_t * vdev_find(uint64_t guid) { vdev_t *vdev; STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) if (vdev->v_guid == guid) return (vdev); return (0); } static vdev_t * vdev_create(uint64_t guid, vdev_read_t *_read) { vdev_t *vdev; vdev = malloc(sizeof(vdev_t)); memset(vdev, 0, sizeof(vdev_t)); STAILQ_INIT(&vdev->v_children); vdev->v_guid = guid; vdev->v_state = VDEV_STATE_OFFLINE; vdev->v_read = _read; vdev->v_phys_read = 0; vdev->v_read_priv = 0; STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); return (vdev); } static int vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, vdev_t **vdevp, int is_newer) { int rc; uint64_t guid, id, ashift, nparity; const char *type; const char *path; vdev_t *vdev, *kid; const unsigned char *kids; int nkids, i, is_new; uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, NULL, &guid) || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, &type)) { printf("ZFS: can't find vdev details\n"); return (ENOENT); } if (strcmp(type, VDEV_TYPE_MIRROR) && strcmp(type, VDEV_TYPE_DISK) #ifdef ZFS_TEST && strcmp(type, VDEV_TYPE_FILE) #endif && strcmp(type, VDEV_TYPE_RAIDZ) && strcmp(type, VDEV_TYPE_REPLACING)) { printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n"); return (EIO); } is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, &is_offline); nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, &is_removed); nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, &is_faulted); nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL, &is_degraded); nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL, &isnt_present); vdev = vdev_find(guid); if (!vdev) { is_new = 1; if (!strcmp(type, VDEV_TYPE_MIRROR)) vdev = vdev_create(guid, vdev_mirror_read); else if (!strcmp(type, VDEV_TYPE_RAIDZ)) vdev = vdev_create(guid, vdev_raidz_read); else if (!strcmp(type, VDEV_TYPE_REPLACING)) vdev = vdev_create(guid, vdev_replacing_read); else vdev = vdev_create(guid, vdev_disk_read); vdev->v_id = id; vdev->v_top = pvdev != NULL ? pvdev : vdev; if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, DATA_TYPE_UINT64, NULL, &ashift) == 0) { vdev->v_ashift = ashift; } else { vdev->v_ashift = 0; } if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, DATA_TYPE_UINT64, NULL, &nparity) == 0) { vdev->v_nparity = nparity; } else { vdev->v_nparity = 0; } if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, DATA_TYPE_STRING, NULL, &path) == 0) { if (strncmp(path, "/dev/", 5) == 0) path += 5; vdev->v_name = strdup(path); } else { if (!strcmp(type, "raidz")) { if (vdev->v_nparity == 1) vdev->v_name = "raidz1"; else if (vdev->v_nparity == 2) vdev->v_name = "raidz2"; else if (vdev->v_nparity == 3) vdev->v_name = "raidz3"; else { printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n"); return (EIO); } } else { vdev->v_name = strdup(type); } } } else { is_new = 0; } if (is_new || is_newer) { /* * This is either new vdev or we've already seen this vdev, * but from an older vdev label, so let's refresh its state * from the newer label. */ if (is_offline) vdev->v_state = VDEV_STATE_OFFLINE; else if (is_removed) vdev->v_state = VDEV_STATE_REMOVED; else if (is_faulted) vdev->v_state = VDEV_STATE_FAULTED; else if (is_degraded) vdev->v_state = VDEV_STATE_DEGRADED; else if (isnt_present) vdev->v_state = VDEV_STATE_CANT_OPEN; } rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); /* * Its ok if we don't have any kids. */ if (rc == 0) { vdev->v_nchildren = nkids; for (i = 0; i < nkids; i++) { rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer); if (rc) return (rc); if (is_new) STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink); kids = nvlist_next(kids); } } else { vdev->v_nchildren = 0; } if (vdevp) *vdevp = vdev; return (0); } static void vdev_set_state(vdev_t *vdev) { vdev_t *kid; int good_kids; int bad_kids; /* * A mirror or raidz is healthy if all its kids are healthy. A * mirror is degraded if any of its kids is healthy; a raidz * is degraded if at most nparity kids are offline. */ if (STAILQ_FIRST(&vdev->v_children)) { good_kids = 0; bad_kids = 0; STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { if (kid->v_state == VDEV_STATE_HEALTHY) good_kids++; else bad_kids++; } if (bad_kids == 0) { vdev->v_state = VDEV_STATE_HEALTHY; } else { if (vdev->v_read == vdev_mirror_read) { if (good_kids) { vdev->v_state = VDEV_STATE_DEGRADED; } else { vdev->v_state = VDEV_STATE_OFFLINE; } } else if (vdev->v_read == vdev_raidz_read) { if (bad_kids > vdev->v_nparity) { vdev->v_state = VDEV_STATE_OFFLINE; } else { vdev->v_state = VDEV_STATE_DEGRADED; } } } } } static spa_t * spa_find_by_guid(uint64_t guid) { spa_t *spa; STAILQ_FOREACH(spa, &zfs_pools, spa_link) if (spa->spa_guid == guid) return (spa); return (0); } static spa_t * spa_find_by_name(const char *name) { spa_t *spa; STAILQ_FOREACH(spa, &zfs_pools, spa_link) if (!strcmp(spa->spa_name, name)) return (spa); return (0); } #ifdef BOOT2 static spa_t * spa_get_primary(void) { return (STAILQ_FIRST(&zfs_pools)); } static vdev_t * spa_get_primary_vdev(const spa_t *spa) { vdev_t *vdev; vdev_t *kid; if (spa == NULL) spa = spa_get_primary(); if (spa == NULL) return (NULL); vdev = STAILQ_FIRST(&spa->spa_vdevs); if (vdev == NULL) return (NULL); for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL; kid = STAILQ_FIRST(&vdev->v_children)) vdev = kid; return (vdev); } #endif static spa_t * spa_create(uint64_t guid, const char *name) { spa_t *spa; if ((spa = malloc(sizeof(spa_t))) == NULL) return (NULL); memset(spa, 0, sizeof(spa_t)); if ((spa->spa_name = strdup(name)) == NULL) { free(spa); return (NULL); } STAILQ_INIT(&spa->spa_vdevs); spa->spa_guid = guid; STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); return (spa); } static const char * state_name(vdev_state_t state) { static const char* names[] = { "UNKNOWN", "CLOSED", "OFFLINE", "REMOVED", "CANT_OPEN", "FAULTED", "DEGRADED", "ONLINE" }; return names[state]; } #ifdef BOOT2 #define pager_printf printf #else static int pager_printf(const char *fmt, ...) { char line[80]; va_list args; va_start(args, fmt); vsprintf(line, fmt, args); va_end(args); return (pager_output(line)); } #endif #define STATUS_FORMAT " %s %s\n" static int print_state(int indent, const char *name, vdev_state_t state) { char buf[512]; int i; buf[0] = 0; for (i = 0; i < indent; i++) strcat(buf, " "); strcat(buf, name); return (pager_printf(STATUS_FORMAT, buf, state_name(state))); } static int vdev_status(vdev_t *vdev, int indent) { vdev_t *kid; int ret; ret = print_state(indent, vdev->v_name, vdev->v_state); if (ret != 0) return (ret); STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { ret = vdev_status(kid, indent + 1); if (ret != 0) return (ret); } return (ret); } static int spa_status(spa_t *spa) { static char bootfs[ZFS_MAXNAMELEN]; uint64_t rootid; vdev_t *vdev; int good_kids, bad_kids, degraded_kids, ret; vdev_state_t state; ret = pager_printf(" pool: %s\n", spa->spa_name); if (ret != 0) return (ret); if (zfs_get_root(spa, &rootid) == 0 && zfs_rlookup(spa, rootid, bootfs) == 0) { if (bootfs[0] == '\0') ret = pager_printf("bootfs: %s\n", spa->spa_name); else ret = pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs); if (ret != 0) return (ret); } ret = pager_printf("config:\n\n"); if (ret != 0) return (ret); ret = pager_printf(STATUS_FORMAT, "NAME", "STATE"); if (ret != 0) return (ret); good_kids = 0; degraded_kids = 0; bad_kids = 0; STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { if (vdev->v_state == VDEV_STATE_HEALTHY) good_kids++; else if (vdev->v_state == VDEV_STATE_DEGRADED) degraded_kids++; else bad_kids++; } state = VDEV_STATE_CLOSED; if (good_kids > 0 && (degraded_kids + bad_kids) == 0) state = VDEV_STATE_HEALTHY; else if ((good_kids + degraded_kids) > 0) state = VDEV_STATE_DEGRADED; ret = print_state(0, spa->spa_name, state); if (ret != 0) return (ret); STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { ret = vdev_status(vdev, 1); if (ret != 0) return (ret); } return (ret); } static int spa_all_status(void) { spa_t *spa; int first = 1, ret = 0; STAILQ_FOREACH(spa, &zfs_pools, spa_link) { if (!first) { ret = pager_printf("\n"); if (ret != 0) return (ret); } first = 0; ret = spa_status(spa); if (ret != 0) return (ret); } return (ret); } -uint64_t +static uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { uint64_t label_offset; if (l < VDEV_LABELS / 2) label_offset = 0; else label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t); return (offset + l * sizeof (vdev_label_t) + label_offset); } static int vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) { vdev_t vtmp; vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; vdev_phys_t *tmp_label; spa_t *spa; vdev_t *vdev, *top_vdev, *pool_vdev; off_t off; blkptr_t bp; const unsigned char *nvlist = NULL; uint64_t val; uint64_t guid; uint64_t best_txg = 0; uint64_t pool_txg, pool_guid; uint64_t psize; const char *pool_name; const unsigned char *vdevs; const unsigned char *features; int i, l, rc, is_newer; char *upbuf; const struct uberblock *up; /* * Load the vdev label and figure out which * uberblock is most current. */ memset(&vtmp, 0, sizeof(vtmp)); vtmp.v_phys_read = _read; vtmp.v_read_priv = read_priv; psize = P2ALIGN(ldi_get_size(read_priv), (uint64_t)sizeof (vdev_label_t)); /* Test for minimum pool size. */ if (psize < SPA_MINDEVSIZE) return (EIO); tmp_label = zfs_alloc(sizeof(vdev_phys_t)); for (l = 0; l < VDEV_LABELS; l++) { off = vdev_label_offset(psize, l, offsetof(vdev_label_t, vl_vdev_phys)); BP_ZERO(&bp); BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); DVA_SET_OFFSET(BP_IDENTITY(&bp), off); ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0)) continue; if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR) continue; nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4; if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, NULL, &pool_txg) != 0) continue; if (best_txg <= pool_txg) { best_txg = pool_txg; memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t)); } } zfs_free(tmp_label, sizeof (vdev_phys_t)); if (best_txg == 0) return (EIO); if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) return (EIO); nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, NULL, &val) != 0) { return (EIO); } if (!SPA_VERSION_IS_SUPPORTED(val)) { printf("ZFS: unsupported ZFS version %u (should be %u)\n", (unsigned) val, (unsigned) SPA_VERSION); return (EIO); } /* Check ZFS features for read */ if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, DATA_TYPE_NVLIST, NULL, &features) == 0 && nvlist_check_features_for_read(features) != 0) { return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, NULL, &val) != 0) { return (EIO); } if (val == POOL_STATE_DESTROYED) { /* We don't boot only from destroyed pools. */ return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, NULL, &pool_txg) != 0 || nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, NULL, &pool_guid) != 0 || nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, NULL, &pool_name) != 0) { /* * Cache and spare devices end up here - just ignore * them. */ /*printf("ZFS: can't find pool details\n");*/ return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, &val) == 0 && val != 0) { return (EIO); } /* * Create the pool if this is the first time we've seen it. */ spa = spa_find_by_guid(pool_guid); if (spa == NULL) { spa = spa_create(pool_guid, pool_name); if (spa == NULL) return (ENOMEM); } if (pool_txg > spa->spa_txg) { spa->spa_txg = pool_txg; is_newer = 1; } else { is_newer = 0; } /* * Get the vdev tree and create our in-core copy of it. * If we already have a vdev with this guid, this must * be some kind of alias (overlapping slices, dangerously dedicated * disks etc). */ if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, NULL, &guid) != 0) { return (EIO); } vdev = vdev_find(guid); if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */ return (EIO); if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL, &vdevs)) { return (EIO); } rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); if (rc != 0) return (rc); /* * Add the toplevel vdev to the pool if its not already there. */ STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) if (top_vdev == pool_vdev) break; if (!pool_vdev && top_vdev) { top_vdev->spa = spa; STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); } /* * We should already have created an incomplete vdev for this * vdev. Find it and initialise it with our read proc. */ vdev = vdev_find(guid); if (vdev) { vdev->v_phys_read = _read; vdev->v_read_priv = read_priv; vdev->v_state = VDEV_STATE_HEALTHY; } else { printf("ZFS: inconsistent nvlist contents\n"); return (EIO); } /* * Re-evaluate top-level vdev state. */ vdev_set_state(top_vdev); /* * Ok, we are happy with the pool so far. Lets find * the best uberblock and then we can actually access * the contents of the pool. */ upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev)); up = (const struct uberblock *)upbuf; for (l = 0; l < VDEV_LABELS; l++) { for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) { off = vdev_label_offset(psize, l, VDEV_UBERBLOCK_OFFSET(vdev, i)); BP_ZERO(&bp); DVA_SET_OFFSET(&bp.blk_dva[0], off); BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); if (vdev_read_phys(vdev, &bp, upbuf, off, 0)) continue; if (up->ub_magic != UBERBLOCK_MAGIC) continue; if (up->ub_txg < spa->spa_txg) continue; if (up->ub_txg > spa->spa_uberblock.ub_txg || (up->ub_txg == spa->spa_uberblock.ub_txg && up->ub_timestamp > spa->spa_uberblock.ub_timestamp)) { spa->spa_uberblock = *up; } } } zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev)); vdev->spa = spa; if (spap != NULL) *spap = spa; return (0); } static int ilog2(int n) { int v; for (v = 0; v < 32; v++) if (n == (1 << v)) return v; return -1; } static int zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) { blkptr_t gbh_bp; zio_gbh_phys_t zio_gb; char *pbuf; int i; /* Artificial BP for gang block header. */ gbh_bp = *bp; BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); for (i = 0; i < SPA_DVAS_PER_BP; i++) DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); /* Read gang header block using the artificial BP. */ if (zio_read(spa, &gbh_bp, &zio_gb)) return (EIO); pbuf = buf; for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { blkptr_t *gbp = &zio_gb.zg_blkptr[i]; if (BP_IS_HOLE(gbp)) continue; if (zio_read(spa, gbp, pbuf)) return (EIO); pbuf += BP_GET_PSIZE(gbp); } if (zio_checksum_verify(spa, bp, buf)) return (EIO); return (0); } static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) { int cpfunc = BP_GET_COMPRESS(bp); uint64_t align, size; void *pbuf; int i, error; /* * Process data embedded in block pointer */ if (BP_IS_EMBEDDED(bp)) { ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); size = BPE_GET_PSIZE(bp); ASSERT(size <= BPE_PAYLOAD_SIZE); if (cpfunc != ZIO_COMPRESS_OFF) pbuf = zfs_alloc(size); else pbuf = buf; decode_embedded_bp_compressed(bp, pbuf); error = 0; if (cpfunc != ZIO_COMPRESS_OFF) { error = zio_decompress_data(cpfunc, pbuf, size, buf, BP_GET_LSIZE(bp)); zfs_free(pbuf, size); } if (error != 0) printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n", error); return (error); } error = EIO; for (i = 0; i < SPA_DVAS_PER_BP; i++) { const dva_t *dva = &bp->blk_dva[i]; vdev_t *vdev; int vdevid; off_t offset; if (!dva->dva_word[0] && !dva->dva_word[1]) continue; vdevid = DVA_GET_VDEV(dva); offset = DVA_GET_OFFSET(dva); STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { if (vdev->v_id == vdevid) break; } if (!vdev || !vdev->v_read) continue; size = BP_GET_PSIZE(bp); if (vdev->v_read == vdev_raidz_read) { align = 1ULL << vdev->v_top->v_ashift; if (P2PHASE(size, align) != 0) size = P2ROUNDUP(size, align); } if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) pbuf = zfs_alloc(size); else pbuf = buf; if (DVA_GET_GANG(dva)) error = zio_read_gang(spa, bp, pbuf); else error = vdev->v_read(vdev, bp, pbuf, offset, size); if (error == 0) { if (cpfunc != ZIO_COMPRESS_OFF) error = zio_decompress_data(cpfunc, pbuf, BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); else if (size != BP_GET_PSIZE(bp)) bcopy(pbuf, buf, BP_GET_PSIZE(bp)); } if (buf != pbuf) zfs_free(pbuf, size); if (error == 0) break; } if (error != 0) printf("ZFS: i/o error - all block copies unavailable\n"); return (error); } static int dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen) { int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; int nlevels = dnode->dn_nlevels; int i, rc; if (bsize > SPA_MAXBLOCKSIZE) { printf("ZFS: I/O error - blocks larger than %llu are not " "supported\n", SPA_MAXBLOCKSIZE); return (EIO); } /* * Note: bsize may not be a power of two here so we need to do an * actual divide rather than a bitshift. */ while (buflen > 0) { uint64_t bn = offset / bsize; int boff = offset % bsize; int ibn; const blkptr_t *indbp; blkptr_t bp; if (bn > dnode->dn_maxblkid) return (EIO); if (dnode == dnode_cache_obj && bn == dnode_cache_bn) goto cached; indbp = dnode->dn_blkptr; for (i = 0; i < nlevels; i++) { /* * Copy the bp from the indirect array so that * we can re-use the scratch buffer for multi-level * objects. */ ibn = bn >> ((nlevels - i - 1) * ibshift); ibn &= ((1 << ibshift) - 1); bp = indbp[ibn]; if (BP_IS_HOLE(&bp)) { memset(dnode_cache_buf, 0, bsize); break; } rc = zio_read(spa, &bp, dnode_cache_buf); if (rc) return (rc); indbp = (const blkptr_t *) dnode_cache_buf; } dnode_cache_obj = dnode; dnode_cache_bn = bn; cached: /* * The buffer contains our data block. Copy what we * need from it and loop. */ i = bsize - boff; if (i > buflen) i = buflen; memcpy(buf, &dnode_cache_buf[boff], i); buf = ((char*) buf) + i; offset += i; buflen -= i; } return (0); } /* * Lookup a value in a microzap directory. Assumes that the zap * scratch buffer contains the directory contents. */ static int mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value) { const mzap_phys_t *mz; const mzap_ent_phys_t *mze; size_t size; int chunks, i; /* * Microzap objects use exactly one block. Read the whole * thing. */ size = dnode->dn_datablkszsec * 512; mz = (const mzap_phys_t *) zap_scratch; chunks = size / MZAP_ENT_LEN - 1; for (i = 0; i < chunks; i++) { mze = &mz->mz_chunk[i]; if (!strcmp(mze->mze_name, name)) { *value = mze->mze_value; return (0); } } return (ENOENT); } /* * Compare a name with a zap leaf entry. Return non-zero if the name * matches. */ static int fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name) { size_t namelen; const zap_leaf_chunk_t *nc; const char *p; namelen = zc->l_entry.le_name_numints; nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { size_t len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; if (memcmp(p, nc->l_array.la_array, len)) return (0); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); } return 1; } /* * Extract a uint64_t value from a zap leaf entry. */ static uint64_t fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) { const zap_leaf_chunk_t *vc; int i; uint64_t value; const uint8_t *p; vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { value = (value << 8) | p[i]; } return value; } static void stv(int len, void *addr, uint64_t value) { switch (len) { case 1: *(uint8_t *)addr = value; return; case 2: *(uint16_t *)addr = value; return; case 4: *(uint32_t *)addr = value; return; case 8: *(uint64_t *)addr = value; return; } } /* * Extract a array from a zap leaf entry. */ static void fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, uint64_t integer_size, uint64_t num_integers, void *buf) { uint64_t array_int_len = zc->l_entry.le_value_intlen; uint64_t value = 0; uint64_t *u64 = buf; char *p = buf; int len = MIN(zc->l_entry.le_value_numints, num_integers); int chunk = zc->l_entry.le_value_chunk; int byten = 0; if (integer_size == 8 && len == 1) { *u64 = fzap_leaf_value(zl, zc); return; } while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array; int i; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl)); for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { stv(integer_size, p, value); byten = 0; len--; if (len == 0) return; p += integer_size; } } chunk = la->la_next; } } /* * Lookup a value in a fatzap directory. Assumes that the zap scratch * buffer contains the directory header. */ static int fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t integer_size, uint64_t num_integers, void *value) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; zap_phys_t zh = *(zap_phys_t *) zap_scratch; fat_zap_t z; uint64_t *ptrtbl; uint64_t hash; int rc; if (zh.zap_magic != ZAP_MAGIC) return (EIO); z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *) zap_scratch; /* * Figure out where the pointer table is and read it in if necessary. */ if (zh.zap_ptrtbl.zt_blk) { rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize, zap_scratch, bsize); if (rc) return (rc); ptrtbl = (uint64_t *) zap_scratch; } else { ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0); } hash = zap_hash(zh.zap_salt, name); zap_leaf_t zl; zl.l_bs = z.zap_block_shift; off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs; zap_leaf_chunk_t *zc; rc = dnode_read(spa, dnode, off, zap_scratch, bsize); if (rc) return (rc); zl.l_phys = (zap_leaf_phys_t *) zap_scratch; /* * Make sure this chunk matches our hash. */ if (zl.l_phys->l_hdr.lh_prefix_len > 0 && zl.l_phys->l_hdr.lh_prefix != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len)) return (ENOENT); /* * Hash within the chunk to find our entry. */ int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len); int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1); h = zl.l_phys->l_hash[h]; if (h == 0xffff) return (ENOENT); zc = &ZAP_LEAF_CHUNK(&zl, h); while (zc->l_entry.le_hash != hash) { if (zc->l_entry.le_next == 0xffff) { zc = NULL; break; } zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next); } if (fzap_name_equal(&zl, zc, name)) { if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > integer_size * num_integers) return (E2BIG); fzap_leaf_array(&zl, zc, integer_size, num_integers, value); return (0); } return (ENOENT); } /* * Lookup a name in a zap object and return its value as a uint64_t. */ static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t integer_size, uint64_t num_integers, void *value) { int rc; uint64_t zap_type; size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; rc = dnode_read(spa, dnode, 0, zap_scratch, size); if (rc) return (rc); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_lookup(dnode, name, value); else if (zap_type == ZBT_HEADER) { return fzap_lookup(spa, dnode, name, integer_size, num_integers, value); } printf("ZFS: invalid zap_type=%d\n", (int)zap_type); return (EIO); } /* * List a microzap directory. Assumes that the zap scratch buffer contains * the directory contents. */ static int mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t)) { const mzap_phys_t *mz; const mzap_ent_phys_t *mze; size_t size; int chunks, i, rc; /* * Microzap objects use exactly one block. Read the whole * thing. */ size = dnode->dn_datablkszsec * 512; mz = (const mzap_phys_t *) zap_scratch; chunks = size / MZAP_ENT_LEN - 1; for (i = 0; i < chunks; i++) { mze = &mz->mz_chunk[i]; if (mze->mze_name[0]) { rc = callback(mze->mze_name, mze->mze_value); if (rc != 0) return (rc); } } return (0); } /* * List a fatzap directory. Assumes that the zap scratch buffer contains * the directory header. */ static int fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t)) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; zap_phys_t zh = *(zap_phys_t *) zap_scratch; fat_zap_t z; int i, j, rc; if (zh.zap_magic != ZAP_MAGIC) return (EIO); z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *) zap_scratch; /* * This assumes that the leaf blocks start at block 1. The * documentation isn't exactly clear on this. */ zap_leaf_t zl; zl.l_bs = z.zap_block_shift; for (i = 0; i < zh.zap_num_leafs; i++) { off_t off = (i + 1) << zl.l_bs; char name[256], *p; uint64_t value; if (dnode_read(spa, dnode, off, zap_scratch, bsize)) return (EIO); zl.l_phys = (zap_leaf_phys_t *) zap_scratch; for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { zap_leaf_chunk_t *zc, *nc; int namelen; zc = &ZAP_LEAF_CHUNK(&zl, j); if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) continue; namelen = zc->l_entry.le_name_numints; if (namelen > sizeof(name)) namelen = sizeof(name); /* * Paste the name back together. */ nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { int len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; memcpy(p, nc->l_array.la_array, len); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); } /* * Assume the first eight bytes of the value are * a uint64_t. */ value = fzap_leaf_value(&zl, zc); //printf("%s 0x%jx\n", name, (uintmax_t)value); rc = callback((const char *)name, value); if (rc != 0) return (rc); } } return (0); } static int zfs_printf(const char *name, uint64_t value __unused) { printf("%s\n", name); return (0); } /* * List a zap directory. */ static int zap_list(const spa_t *spa, const dnode_phys_t *dnode) { uint64_t zap_type; size_t size = dnode->dn_datablkszsec * 512; if (dnode_read(spa, dnode, 0, zap_scratch, size)) return (EIO); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_list(dnode, zfs_printf); else return fzap_list(spa, dnode, zfs_printf); } static int objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode) { off_t offset; offset = objnum * sizeof(dnode_phys_t); return dnode_read(spa, &os->os_meta_dnode, offset, dnode, sizeof(dnode_phys_t)); } static int mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { const mzap_phys_t *mz; const mzap_ent_phys_t *mze; size_t size; int chunks, i; /* * Microzap objects use exactly one block. Read the whole * thing. */ size = dnode->dn_datablkszsec * 512; mz = (const mzap_phys_t *) zap_scratch; chunks = size / MZAP_ENT_LEN - 1; for (i = 0; i < chunks; i++) { mze = &mz->mz_chunk[i]; if (value == mze->mze_value) { strcpy(name, mze->mze_name); return (0); } } return (ENOENT); } static void fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) { size_t namelen; const zap_leaf_chunk_t *nc; char *p; namelen = zc->l_entry.le_name_numints; nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { size_t len; len = namelen; if (len > ZAP_LEAF_ARRAY_BYTES) len = ZAP_LEAF_ARRAY_BYTES; memcpy(p, nc->l_array.la_array, len); p += len; namelen -= len; nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); } *p = '\0'; } static int fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; zap_phys_t zh = *(zap_phys_t *) zap_scratch; fat_zap_t z; int i, j; if (zh.zap_magic != ZAP_MAGIC) return (EIO); z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *) zap_scratch; /* * This assumes that the leaf blocks start at block 1. The * documentation isn't exactly clear on this. */ zap_leaf_t zl; zl.l_bs = z.zap_block_shift; for (i = 0; i < zh.zap_num_leafs; i++) { off_t off = (i + 1) << zl.l_bs; if (dnode_read(spa, dnode, off, zap_scratch, bsize)) return (EIO); zl.l_phys = (zap_leaf_phys_t *) zap_scratch; for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { zap_leaf_chunk_t *zc; zc = &ZAP_LEAF_CHUNK(&zl, j); if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) continue; if (zc->l_entry.le_value_intlen != 8 || zc->l_entry.le_value_numints != 1) continue; if (fzap_leaf_value(&zl, zc) == value) { fzap_name_copy(&zl, zc, name); return (0); } } } return (ENOENT); } static int zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { int rc; uint64_t zap_type; size_t size = dnode->dn_datablkszsec * 512; rc = dnode_read(spa, dnode, 0, zap_scratch, size); if (rc) return (rc); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_rlookup(spa, dnode, name, value); else return fzap_rlookup(spa, dnode, name, value); } static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) { char name[256]; char component[256]; uint64_t dir_obj, parent_obj, child_dir_zapobj; dnode_phys_t child_dir_zap, dataset, dir, parent; dsl_dir_phys_t *dd; dsl_dataset_phys_t *ds; char *p; int len; p = &name[sizeof(name) - 1]; *p = '\0'; if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (EIO); } ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; dir_obj = ds->ds_dir_obj; for (;;) { if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0) return (EIO); dd = (dsl_dir_phys_t *)&dir.dn_bonus; /* Actual loop condition. */ parent_obj = dd->dd_parent_obj; if (parent_obj == 0) break; if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0) return (EIO); dd = (dsl_dir_phys_t *)&parent.dn_bonus; child_dir_zapobj = dd->dd_child_dir_zapobj; if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) return (EIO); if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) return (EIO); len = strlen(component); p -= len; memcpy(p, component, len); --p; *p = '/'; /* Actual loop iteration. */ dir_obj = parent_obj; } if (*p != '\0') ++p; strcpy(result, p); return (0); } static int zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) { char element[256]; uint64_t dir_obj, child_dir_zapobj; dnode_phys_t child_dir_zap, dir; dsl_dir_phys_t *dd; const char *p, *q; if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) return (EIO); if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj), 1, &dir_obj)) return (EIO); p = name; for (;;) { if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) return (EIO); dd = (dsl_dir_phys_t *)&dir.dn_bonus; while (*p == '/') p++; /* Actual loop condition #1. */ if (*p == '\0') break; q = strchr(p, '/'); if (q) { memcpy(element, p, q - p); element[q - p] = '\0'; p = q + 1; } else { strcpy(element, p); p += strlen(p); } child_dir_zapobj = dd->dd_child_dir_zapobj; if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) return (EIO); /* Actual loop condition #2. */ if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj), 1, &dir_obj) != 0) return (ENOENT); } *objnum = dd->dd_head_dataset_obj; return (0); } #ifndef BOOT2 static int zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) { uint64_t dir_obj, child_dir_zapobj; dnode_phys_t child_dir_zap, dir, dataset; dsl_dataset_phys_t *ds; dsl_dir_phys_t *dd; if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (EIO); } ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; dir_obj = ds->ds_dir_obj; if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) { printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); return (EIO); } dd = (dsl_dir_phys_t *)&dir.dn_bonus; child_dir_zapobj = dd->dd_child_dir_zapobj; if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) { printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); return (EIO); } return (zap_list(spa, &child_dir_zap) != 0); } int zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t)) { uint64_t dir_obj, child_dir_zapobj, zap_type; dnode_phys_t child_dir_zap, dir, dataset; dsl_dataset_phys_t *ds; dsl_dir_phys_t *dd; int err; err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset); if (err != 0) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (err); } ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; dir_obj = ds->ds_dir_obj; err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir); if (err != 0) { printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); return (err); } dd = (dsl_dir_phys_t *)&dir.dn_bonus; child_dir_zapobj = dd->dd_child_dir_zapobj; err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap); if (err != 0) { printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); return (err); } err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512); if (err != 0) return (err); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) return mzap_list(&child_dir_zap, callback); else return fzap_list(spa, &child_dir_zap, callback); } #endif /* * Find the object set given the object number of its dataset object * and return its details in *objset */ static int zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) { dnode_phys_t dataset; dsl_dataset_phys_t *ds; if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); return (EIO); } ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; if (zio_read(spa, &ds->ds_bp, objset)) { printf("ZFS: can't read object set for dataset %ju\n", (uintmax_t)objnum); return (EIO); } return (0); } /* * Find the object set pointed to by the BOOTFS property or the root * dataset if there is none and return its details in *objset */ static int zfs_get_root(const spa_t *spa, uint64_t *objid) { dnode_phys_t dir, propdir; uint64_t props, bootfs, root; *objid = 0; /* * Start with the MOS directory object. */ if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { printf("ZFS: can't read MOS object directory\n"); return (EIO); } /* * Lookup the pool_props and see if we can find a bootfs. */ if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0 && bootfs != 0) { *objid = bootfs; return (0); } /* * Lookup the root dataset directory */ if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root) || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) { printf("ZFS: can't find root dsl_dir\n"); return (EIO); } /* * Use the information from the dataset directory's bonus buffer * to find the dataset object and from that the object set itself. */ dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus; *objid = dd->dd_head_dataset_obj; return (0); } static int zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) { mount->spa = spa; /* * Find the root object set if not explicitly provided */ if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { printf("ZFS: can't find root filesystem\n"); return (EIO); } if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { printf("ZFS: can't open root filesystem\n"); return (EIO); } mount->rootobj = rootobj; return (0); } /* * callback function for feature name checks. */ static int check_feature(const char *name, uint64_t value) { int i; if (value == 0) return (0); if (name[0] == '\0') return (0); for (i = 0; features_for_read[i] != NULL; i++) { if (strcmp(name, features_for_read[i]) == 0) return (0); } printf("ZFS: unsupported feature: %s\n", name); return (EIO); } /* * Checks whether the MOS features that are active are supported. */ static int check_mos_features(const spa_t *spa) { dnode_phys_t dir; uint64_t objnum, zap_type; size_t size; int rc; if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY, &dir)) != 0) return (rc); if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ, sizeof (objnum), 1, &objnum)) != 0) { /* * It is older pool without features. As we have already * tested the label, just return without raising the error. */ return (0); } if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0) return (rc); if (dir.dn_type != DMU_OTN_ZAP_METADATA) return (EIO); size = dir.dn_datablkszsec * 512; if (dnode_read(spa, &dir, 0, zap_scratch, size)) return (EIO); zap_type = *(uint64_t *) zap_scratch; if (zap_type == ZBT_MICRO) rc = mzap_list(&dir, check_feature); else rc = fzap_list(spa, &dir, check_feature); return (rc); } static int zfs_spa_init(spa_t *spa) { dnode_phys_t dir; int rc; if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); return (EIO); } if (spa->spa_mos.os_type != DMU_OST_META) { printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name); return (EIO); } if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { printf("ZFS: failed to read pool %s directory object\n", spa->spa_name); return (EIO); } /* this is allowed to fail, older pools do not have salt */ rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); rc = check_mos_features(spa); if (rc != 0) { printf("ZFS: pool %s is not supported\n", spa->spa_name); } return (rc); } static int zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) { if (dn->dn_bonustype != DMU_OT_SA) { znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; sb->st_mode = zp->zp_mode; sb->st_uid = zp->zp_uid; sb->st_gid = zp->zp_gid; sb->st_size = zp->zp_size; } else { sa_hdr_phys_t *sahdrp; int hdrsize; size_t size = 0; void *buf = NULL; if (dn->dn_bonuslen != 0) sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); else { if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { blkptr_t *bp = DN_SPILL_BLKPTR(dn); int error; size = BP_GET_LSIZE(bp); buf = zfs_alloc(size); error = zio_read(spa, bp, buf); if (error != 0) { zfs_free(buf, size); return (error); } sahdrp = buf; } else { return (EIO); } } hdrsize = SA_HDR_SIZE(sahdrp); sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + SA_MODE_OFFSET); sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + SA_UID_OFFSET); sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + SA_GID_OFFSET); sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + SA_SIZE_OFFSET); if (buf != NULL) zfs_free(buf, size); } return (0); } static int zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize) { int rc = 0; if (dn->dn_bonustype == DMU_OT_SA) { sa_hdr_phys_t *sahdrp = NULL; size_t size = 0; void *buf = NULL; int hdrsize; char *p; if (dn->dn_bonuslen != 0) sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); else { blkptr_t *bp; if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0) return (EIO); bp = DN_SPILL_BLKPTR(dn); size = BP_GET_LSIZE(bp); buf = zfs_alloc(size); rc = zio_read(spa, bp, buf); if (rc != 0) { zfs_free(buf, size); return (rc); } sahdrp = buf; } hdrsize = SA_HDR_SIZE(sahdrp); p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET); memcpy(path, p, psize); if (buf != NULL) zfs_free(buf, size); return (0); } /* * Second test is purely to silence bogus compiler * warning about accessing past the end of dn_bonus. */ if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen && sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) { memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize); } else { rc = dnode_read(spa, dn, 0, path, psize); } return (rc); } struct obj_list { uint64_t objnum; STAILQ_ENTRY(obj_list) entry; }; /* * Lookup a file and return its dnode. */ static int zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) { int rc; uint64_t objnum; const spa_t *spa; dnode_phys_t dn; const char *p, *q; char element[256]; char path[1024]; int symlinks_followed = 0; struct stat sb; struct obj_list *entry, *tentry; STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache); spa = mount->spa; if (mount->objset.os_type != DMU_OST_ZFS) { printf("ZFS: unexpected object set type %ju\n", (uintmax_t)mount->objset.os_type); return (EIO); } if ((entry = malloc(sizeof(struct obj_list))) == NULL) return (ENOMEM); /* * Get the root directory dnode. */ rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); if (rc) { free(entry); return (rc); } rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum); if (rc) { free(entry); return (rc); } entry->objnum = objnum; STAILQ_INSERT_HEAD(&on_cache, entry, entry); rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); if (rc != 0) goto done; p = upath; while (p && *p) { rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); if (rc != 0) goto done; while (*p == '/') p++; if (*p == '\0') break; q = p; while (*q != '\0' && *q != '/') q++; /* skip dot */ if (p + 1 == q && p[0] == '.') { p++; continue; } /* double dot */ if (p + 2 == q && p[0] == '.' && p[1] == '.') { p += 2; if (STAILQ_FIRST(&on_cache) == STAILQ_LAST(&on_cache, obj_list, entry)) { rc = ENOENT; goto done; } entry = STAILQ_FIRST(&on_cache); STAILQ_REMOVE_HEAD(&on_cache, entry); free(entry); objnum = (STAILQ_FIRST(&on_cache))->objnum; continue; } if (q - p + 1 > sizeof(element)) { rc = ENAMETOOLONG; goto done; } memcpy(element, p, q - p); element[q - p] = 0; p = q; if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0) goto done; if (!S_ISDIR(sb.st_mode)) { rc = ENOTDIR; goto done; } rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum); if (rc) goto done; objnum = ZFS_DIRENT_OBJ(objnum); if ((entry = malloc(sizeof(struct obj_list))) == NULL) { rc = ENOMEM; goto done; } entry->objnum = objnum; STAILQ_INSERT_HEAD(&on_cache, entry, entry); rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); if (rc) goto done; /* * Check for symlink. */ rc = zfs_dnode_stat(spa, &dn, &sb); if (rc) goto done; if (S_ISLNK(sb.st_mode)) { if (symlinks_followed > 10) { rc = EMLINK; goto done; } symlinks_followed++; /* * Read the link value and copy the tail of our * current path onto the end. */ if (sb.st_size + strlen(p) + 1 > sizeof(path)) { rc = ENAMETOOLONG; goto done; } strcpy(&path[sb.st_size], p); rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size); if (rc != 0) goto done; /* * Restart with the new path, starting either at * the root or at the parent depending whether or * not the link is relative. */ p = path; if (*p == '/') { while (STAILQ_FIRST(&on_cache) != STAILQ_LAST(&on_cache, obj_list, entry)) { entry = STAILQ_FIRST(&on_cache); STAILQ_REMOVE_HEAD(&on_cache, entry); free(entry); } } else { entry = STAILQ_FIRST(&on_cache); STAILQ_REMOVE_HEAD(&on_cache, entry); free(entry); } objnum = (STAILQ_FIRST(&on_cache))->objnum; } } *dnode = dn; done: STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry) free(entry); return (rc); } Index: head/sys/cddl/boot/zfs/zfssubr.c =================================================================== --- head/sys/cddl/boot/zfs/zfssubr.c (revision 326447) +++ head/sys/cddl/boot/zfs/zfssubr.c (revision 326448) @@ -1,1787 +1,1787 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); static uint64_t zfs_crc64_table[256]; #define ECKSUM 666 #define ASSERT3S(x, y, z) ((void)0) #define ASSERT3U(x, y, z) ((void)0) #define ASSERT3P(x, y, z) ((void)0) #define ASSERT0(x) ((void)0) #define ASSERT(x) ((void)0) #define panic(...) do { \ printf(__VA_ARGS__); \ for (;;) ; \ } while (0) #define kmem_alloc(size, flag) zfs_alloc((size)) #define kmem_free(ptr, size) zfs_free((ptr), (size)) static void zfs_init_crc(void) { int i, j; uint64_t *ct; /* * Calculate the crc64 table (used for the zap hash * function). */ if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); } } static void zio_checksum_off(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } /* * Signature for checksum functions. */ typedef void zio_checksum_t(const void *data, uint64_t size, const void *ctx_template, zio_cksum_t *zcp); typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); typedef void zio_checksum_tmpl_free_t(void *ctx_template); typedef enum zio_checksum_flags { /* Strong enough for metadata? */ ZCHECKSUM_FLAG_METADATA = (1 << 1), /* ZIO embedded checksum */ ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), /* Strong enough for dedup (without verification)? */ ZCHECKSUM_FLAG_DEDUP = (1 << 3), /* Uses salt value */ ZCHECKSUM_FLAG_SALTED = (1 << 4), /* Strong enough for nopwrite? */ ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) } zio_checksum_flags_t; /* * Information about each checksum function. */ typedef struct zio_checksum_info { /* checksum function for each byteorder */ zio_checksum_t *ci_func[2]; zio_checksum_tmpl_init_t *ci_tmpl_init; zio_checksum_tmpl_free_t *ci_tmpl_free; zio_checksum_flags_t ci_flags; const char *ci_name; /* descriptive name */ } zio_checksum_info_t; #include "blkptr.c" #include "fletcher.c" #include "sha256.c" #include "skein_zfs.c" static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "noparity"}, {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, /* no edonr for now */ {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"} }; /* * Common signature for all zio compress/decompress functions. */ typedef size_t zio_compress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); typedef int zio_decompress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); /* * Information about each compression function. */ typedef struct zio_compress_info { zio_compress_func_t *ci_compress; /* compression function */ zio_decompress_func_t *ci_decompress; /* decompression function */ int ci_level; /* level parameter */ const char *ci_name; /* algorithm name */ } zio_compress_info_t; #include "lzjb.c" #include "zle.c" #include "lz4.c" /* * Compression vectors. */ static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {NULL, NULL, 0, "inherit"}, {NULL, NULL, 0, "on"}, {NULL, NULL, 0, "uncompressed"}, {NULL, lzjb_decompress, 0, "lzjb"}, {NULL, NULL, 0, "empty"}, {NULL, NULL, 1, "gzip-1"}, {NULL, NULL, 2, "gzip-2"}, {NULL, NULL, 3, "gzip-3"}, {NULL, NULL, 4, "gzip-4"}, {NULL, NULL, 5, "gzip-5"}, {NULL, NULL, 6, "gzip-6"}, {NULL, NULL, 7, "gzip-7"}, {NULL, NULL, 8, "gzip-8"}, {NULL, NULL, 9, "gzip-9"}, {NULL, zle_decompress, 64, "zle"}, {NULL, lz4_decompress, 0, "lz4"}, }; static void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } /* * Set the external verifier for a gang block based on , * a tuple which is guaranteed to be unique for the life of the pool. */ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); } /* * Set the external verifier for a label block based on its offset. * The vdev is implicit, and the txg is unknowable at pool open time -- * hence the logic in vdev_uberblock_load() to find the most recent copy. */ static void zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) { ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); } /* * Calls the template init function of a checksum which supports context * templates and installs the template into the spa_t. */ static void zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; if (ci->ci_tmpl_init == NULL) return; if (spa->spa_cksum_tmpls[checksum] != NULL) return; if (spa->spa_cksum_tmpls[checksum] == NULL) { spa->spa_cksum_tmpls[checksum] = ci->ci_tmpl_init(&spa->spa_cksum_salt); } } /* * Called by a spa_t that's about to be deallocated. This steps through * all of the checksum context templates and deallocates any that were * initialized using the algorithm-specific template init function. */ -void +static void zio_checksum_templates_free(spa_t *spa) { for (enum zio_checksum checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { if (spa->spa_cksum_tmpls[checksum] != NULL) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); spa->spa_cksum_tmpls[checksum] = NULL; } } } static int zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) { uint64_t size; unsigned int checksum; zio_checksum_info_t *ci; void *ctx = NULL; zio_cksum_t actual_cksum, expected_cksum, verifier; int byteswap; checksum = BP_GET_CHECKSUM(bp); size = BP_GET_PSIZE(bp); if (checksum >= ZIO_CHECKSUM_FUNCTIONS) return (EINVAL); ci = &zio_checksum_table[checksum]; if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) return (EINVAL); if (spa != NULL) { zio_checksum_template_init(checksum, (spa_t *) spa); ctx = spa->spa_cksum_tmpls[checksum]; } if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || checksum == ZIO_CHECKSUM_LABEL); eck = (zio_eck_t *)((char *)data + size) - 1; if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&verifier, DVA_GET_OFFSET(BP_IDENTITY(bp))); else verifier = bp->blk_cksum; byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; ci->ci_func[byteswap](data, size, ctx, &actual_cksum); eck->zec_cksum = expected_cksum; if (byteswap) byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } else { expected_cksum = bp->blk_cksum; ci->ci_func[0](data, size, ctx, &actual_cksum); } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/ return (EIO); } return (0); } static int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, void *dest, uint64_t destsize) { zio_compress_info_t *ci; if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { printf("ZFS: unsupported compression algorithm %u\n", cpfunc); return (EIO); } ci = &zio_compress_table[cpfunc]; if (!ci->ci_decompress) { printf("ZFS: unsupported compression algorithm %s\n", ci->ci_name); return (EIO); } return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); } static uint64_t zap_hash(uint64_t salt, const char *name) { const uint8_t *cp; uint8_t c; uint64_t crc = salt; ASSERT(crc != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; /* * Only use 28 bits, since we need 4 bits in the cookie for the * collision differentiator. We MUST use the high bits, since * those are the onces that we first pay attention to when * chosing the bucket. */ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); return (crc); } static void *zfs_alloc(size_t size); static void zfs_free(void *ptr, size_t size); typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ } raidz_col_t; typedef struct raidz_map { uint64_t rm_cols; /* Regular column count */ uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } /* * These two tables represent powers and logs of 2 in the Galois field defined * above. These values were computed by repeatedly multiplying by 2 as above. */ static const uint8_t vdev_raidz_pow2[256] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 }; static const uint8_t vdev_raidz_log2[256] = { 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; /* * Multiply a given number by 2 raised to the given power. */ static uint8_t vdev_raidz_exp2(uint8_t a, int exp) { if (a == 0) return (0); ASSERT(exp >= 0); ASSERT(vdev_raidz_log2[a] > 0 || a == 1); exp += vdev_raidz_log2[a]; if (exp > 255) exp -= 255; return (vdev_raidz_pow2[exp]); } static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { uint64_t *p, *src, pcount, ccount, i; int c; pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; ccount = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } } } static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; *q = *src; } for (; i < pcnt; i++, src++, p++, q++) { *p = 0; *q = 0; } } else { ASSERT(ccnt <= pcnt); /* * Apply the algorithm described above by multiplying * the previous result and adding in the new value. */ for (i = 0; i < ccnt; i++, src++, p++, q++) { *p ^= *src; VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; } /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ for (; i < pcnt; i++, q++) { VDEV_RAIDZ_64MUL_2(*q, mask); } } } } static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; int c; pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; r = rm->rm_col[VDEV_RAIDZ_R].rc_data; ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p = *src; *q = *src; *r = *src; } for (; i < pcnt; i++, src++, p++, q++, r++) { *p = 0; *q = 0; *r = 0; } } else { ASSERT(ccnt <= pcnt); /* * Apply the algorithm described above by multiplying * the previous result and adding in the new value. */ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; VDEV_RAIDZ_64MUL_4(*r, mask); *r ^= *src; } /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ for (; i < pcnt; i++, q++, r++) { VDEV_RAIDZ_64MUL_2(*q, mask); VDEV_RAIDZ_64MUL_4(*r, mask); } } } } /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ static void vdev_raidz_generate_parity(raidz_map_t *rm) { switch (rm->rm_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rm); break; case 2: vdev_raidz_generate_parity_pq(rm); break; case 3: vdev_raidz_generate_parity_pqr(rm); break; default: panic("invalid RAID-Z configuration"); } } /* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coeffecients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): * * __ __ __ __ * | | __ __ | p_0 | * | V | | D_0 | | p_m-1 | * | | x | : | = | d_0 | * | I | | D_n-1 | | : | * | | ~~ ~~ | d_n-1 | * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde * matrix defined by the coeffecients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * * __ __ __ __ * | 1 .. 1 1 1 | | p_0 | * | 2^n-1 .. 4 2 1 | __ __ | : | * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | * | 1 .. 0 0 0 | | D_1 | | d_0 | * | 0 .. 0 0 0 | x | D_2 | = | d_1 | * | : : : : | | : | | d_2 | * | 0 .. 1 0 0 | | D_n-1 | | : | * | 0 .. 0 1 0 | ~~ ~~ | : | * | 0 .. 0 0 1 | | d_n-1 | * ~~ ~~ ~~ ~~ * * Note that I, V, d, and p are known. To compute D, we must invert the * matrix and use the known data and parity values to reconstruct the unknown * data values. We begin by removing the rows in V|I and d|p that correspond * to failed or missing columns; we then make V|I square (n x n) and d|p * sized n by removing rows corresponding to unused parity from the bottom up * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' * using Gauss-Jordan elimination. In the example below we use m=3 parity * columns, n=8 data columns, with errors in d_1, d_2, and p_1: * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks * | 19 205 116 29 64 16 4 1 | / / * | 1 0 0 0 0 0 0 0 | / / * | 0 1 0 0 0 0 0 0 | <--' / * (V|I) = | 0 0 1 0 0 0 0 0 | <---' * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 | * (V|I)' = | 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We * have carefully chosen the seed values 1, 2, and 4 to ensure that this * matrix is not singular. * __ __ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 0 0 1 0 0 0 0 0 | * | 167 100 5 41 159 169 217 208 | * | 166 100 4 40 158 168 216 209 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values * of the missing data. * * As is apparent from the example above, the only non-trivial rows in the * inverse matrix correspond to the data disks that we're trying to * reconstruct. Indeed, those are the only rows we need as the others would * only be useful for reconstructing data known or assumed to be valid. For * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ /* END CSTYLED */ static void vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); /* * Fill in the missing rows of interest. */ for (i = 0; i < nmap; i++) { ASSERT3S(0, <=, map[i]); ASSERT3S(map[i], <=, 2); pow = map[i] * n; if (pow > 255) pow -= 255; ASSERT(pow <= 255); for (j = 0; j < n; j++) { pow -= map[i]; if (pow < 0) pow += 255; rows[i][j] = vdev_raidz_pow2[pow]; } } } static void vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; uint8_t log; /* * Assert that the first nmissing entries from the array of used * columns correspond to parity columns and that subsequent entries * correspond to data columns. */ for (i = 0; i < nmissing; i++) { ASSERT3S(used[i], <, rm->rm_firstdatacol); } for (; i < n; i++) { ASSERT3S(used[i], >=, rm->rm_firstdatacol); } /* * First initialize the storage where we'll compute the inverse rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { invrows[i][j] = (i == j) ? 1 : 0; } } /* * Subtract all trivial rows from the rows of consequence. */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { ASSERT3U(used[j], >=, rm->rm_firstdatacol); jj = used[j] - rm->rm_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; } } /* * For each of the rows of interest, we must normalize it and subtract * a multiple of it from the other rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { ASSERT3U(rows[i][j], ==, 0); } ASSERT3U(rows[i][missing[i]], !=, 0); /* * Compute the inverse of the first element and multiply each * element in the row by that value. */ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; for (j = 0; j < n; j++) { rows[i][j] = vdev_raidz_exp2(rows[i][j], log); invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); } for (ii = 0; ii < nmissing; ii++) { if (i == ii) continue; ASSERT3U(rows[ii][missing[i]], !=, 0); log = vdev_raidz_log2[rows[ii][missing[i]]]; for (j = 0; j < n; j++) { rows[ii][j] ^= vdev_raidz_exp2(rows[i][j], log); invrows[ii][j] ^= vdev_raidz_exp2(invrows[i][j], log); } } } /* * Verify that the data that is left in the rows are properly part of * an identity matrix. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { ASSERT3U(rows[i][j], ==, 0); } } } } static void vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; uint8_t *src; uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; uint8_t log, val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; log = 0; /* gcc */ psize = sizeof (invlog[0][0]) * n * nmissing; p = zfs_alloc(psize); for (pp = p, i = 0; i < nmissing; i++) { invlog[i] = pp; pp += n; } for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { ASSERT3U(invrows[i][j], !=, 0); invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; } } for (i = 0; i < n; i++) { c = used[i]; ASSERT3U(c, <, rm->rm_cols); src = rm->rm_col[c].rc_data; ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; ASSERT3U(cc, >=, rm->rm_firstdatacol); ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); dst[j] = rm->rm_col[cc].rc_data; dcount[j] = rm->rm_col[cc].rc_size; } ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; for (cc = 0; cc < nmissing; cc++) { if (x >= dcount[cc]) continue; if (*src == 0) { val = 0; } else { if ((ll = log + invlog[cc][i]) >= 255) ll -= 255; val = vdev_raidz_pow2[ll]; } if (i == 0) dst[cc][x] = val; else dst[cc][x] ^= val; } } } zfs_free(p, psize); } static int vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; int code = 0; n = rm->rm_cols - rm->rm_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { if (tgts[t] >= rm->rm_firstdatacol) { missing_rows[nmissing_rows++] = tgts[t] - rm->rm_firstdatacol; } } /* * Figure out which parity columns to use to help generate the missing * data columns. */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); ASSERT(c < rm->rm_firstdatacol); /* * Skip any targeted parity columns. */ if (c == tgts[tt]) { tt++; continue; } code |= 1 << c; parity_map[i] = c; i++; } ASSERT(code != 0); ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing_rows; i++) { rows[i] = pp; pp += n; invrows[i] = pp; pp += n; } used = pp; for (i = 0; i < nmissing_rows; i++) { used[i] = parity_map[i]; } for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { if (tt < nmissing_rows && c == missing_rows[tt] + rm->rm_firstdatacol) { tt++; continue; } ASSERT3S(i, <, n); used[i] = c; i++; } /* * Initialize the interesting rows of the matrix. */ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); return (code); } static int vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY]; int ntgts; int i, c; int code; int nbadparity, nbaddata; /* * The tgts list must already be sorted. */ for (i = 1; i < nt; i++) { ASSERT(t[i] > t[i - 1]); } nbadparity = rm->rm_firstdatacol; nbaddata = rm->rm_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rm->rm_cols; c++) { if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; } else if (rm->rm_col[c].rc_error != 0) { tgts[ntgts++] = c; } else if (c >= rm->rm_firstdatacol) { nbaddata--; } else { nbadparity--; } } ASSERT(ntgts >= nt); ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); } static raidz_map_t * vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; uint64_t b = offset >> unit_shift; uint64_t s = size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); tot = s + nparity * (q + (r == 0 ? 0 : 1)); if (q == 0) { acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); rm->rm_cols = acols; rm->rm_scols = scols; rm->rm_bigcols = bc; rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; asize = 0; for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << unit_shift; } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_data = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; if (c >= acols) rm->rm_col[c].rc_size = 0; else if (c < bc) rm->rm_col[c].rc_size = (q + 1) << unit_shift; else rm->rm_col[c].rc_size = q << unit_shift; asize += rm->rm_col[c].rc_size; } ASSERT3U(asize, ==, tot << unit_shift); rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); rm->rm_col[c].rc_data = data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + rm->rm_col[c - 1].rc_size; /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior. * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. */ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } return (rm); } static void vdev_raidz_map_free(raidz_map_t *rm) { int c; for (c = rm->rm_firstdatacol - 1; c >= 0; c--) zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } static vdev_t * vdev_child(vdev_t *pvd, uint64_t devidx) { vdev_t *cvd; STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { if (cvd->v_id == devidx) break; } return (cvd); } /* * We keep track of whether or not there were any injected errors, so that * any ereports we generate can note it. */ static int raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, uint64_t size) { return (zio_checksum_verify(spa, bp, data)); } /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the * number such failures. */ static int raidz_parity_verify(raidz_map_t *rm) { void *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_col_t *rc; for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zfs_alloc(rc->rc_size); bcopy(rc->rc_data, orig[c], rc->rc_size); } vdev_raidz_generate_parity(rm); for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { rc->rc_error = ECKSUM; ret++; } zfs_free(orig[c], rc->rc_size); } return (ret); } /* * Iterate over all combinations of bad data and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into * account how reconstruction is actually performed. For example, with * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. */ static int vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors) { raidz_col_t *rc; void *orig[VDEV_RAIDZ_MAXPARITY]; int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *tgts = &tstore[1]; int current, next, i, c, n; int code, ret = 0; ASSERT(total_errors < rm->rm_firstdatacol); /* * This simplifies one edge condition. */ tgts[-1] = -1; for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { /* * Initialize the targets array by finding the first n columns * that contain no error. * * If there were no data errors, we need to ensure that we're * always explicitly attempting to reconstruct at least one * data column. To do this, we simply push the highest target * up into the data columns. */ for (c = 0, i = 0; i < n; i++) { if (i == n - 1 && data_errors == 0 && c < rm->rm_firstdatacol) { c = rm->rm_firstdatacol; } while (rm->rm_col[c].rc_error != 0) { c++; ASSERT3S(c, <, rm->rm_cols); } tgts[i] = c++; } /* * Setting tgts[n] simplifies the other edge condition. */ tgts[n] = rm->rm_cols; /* * These buffers were allocated in previous iterations. */ for (i = 0; i < n - 1; i++) { ASSERT(orig[i] != NULL); } orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size); current = 0; next = tgts[current]; while (current != n) { tgts[current] = next; current = 0; /* * Save off the original data that we're going to * attempt to reconstruct. */ for (i = 0; i < n; i++) { ASSERT(orig[i] != NULL); c = tgts[i]; ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; bcopy(rc->rc_data, orig[i], rc->rc_size); } /* * Attempt a reconstruction and exit the outer loop on * success. */ code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(spa, bp, data, bytes) == 0) { for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; ASSERT(rc->rc_error == 0); rc->rc_error = ECKSUM; } ret = code; goto done; } /* * Restore the original data. */ for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; bcopy(orig[i], rc->rc_data, rc->rc_size); } do { /* * Find the next valid column after the current * position.. */ for (next = tgts[current] + 1; next < rm->rm_cols && rm->rm_col[next].rc_error != 0; next++) continue; ASSERT(next <= tgts[current + 1]); /* * If that spot is available, we're done here. */ if (next != tgts[current + 1]) break; /* * Otherwise, find the next valid column after * the previous position. */ for (c = tgts[current - 1] + 1; rm->rm_col[c].rc_error != 0; c++) continue; tgts[current] = c; current++; } while (current != n); } } n--; done: for (i = n - 1; i >= 0; i--) { zfs_free(orig[i], rm->rm_col[0].rc_size); } return (ret); } static int vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, off_t offset, size_t bytes) { vdev_t *tvd = vd->v_top; vdev_t *cvd; raidz_map_t *rm; raidz_col_t *rc; int c, error; int unexpected_errors; int parity_errors; int parity_untried; int data_errors; int total_errors; int n; int tgts[VDEV_RAIDZ_MAXPARITY]; int code; rc = NULL; /* gcc */ error = 0; rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, vd->v_nchildren, vd->v_nparity); /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vdev_child(vd, rc->rc_devidx); if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else rm->rm_missingparity++; rc->rc_error = ENXIO; rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } #if 0 /* XXX: Too hard for the boot code. */ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else rm->rm_missingparity++; rc->rc_error = ESTALE; rc->rc_skipped = 1; continue; } #endif if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, rc->rc_offset, rc->rc_size); rc->rc_tried = 1; rc->rc_skipped = 0; } } reconstruct: unexpected_errors = 0; parity_errors = 0; parity_untried = 0; data_errors = 0; total_errors = 0; ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ if (c < rm->rm_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; total_errors++; } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { parity_untried++; } } /* * There are three potential phases for a read: * 1. produce valid data from the columns read * 2. read all disks and try again * 3. perform combinatorial reconstruction * * Each phase is progressively both more expensive and less likely to * occur. If we encounter more errors than we can repair or all phases * fail, we have no choice but to return an error. */ /* * If the number of errors we saw was correctable -- less than or equal * to the number of parity disks read -- attempt to produce data that * has a valid checksum. Naturally, this case applies in the absence of * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { if (data_errors == 0) { if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was * needed) regenerate and verify the parity. * We also regenerate parity when resilvering * so we can write it out to the failed device * later. */ if (parity_errors + parity_untried < rm->rm_firstdatacol) { n = raidz_parity_verify(rm); unexpected_errors += n; ASSERT(parity_errors + n <= rm->rm_firstdatacol); } goto done; } } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we * wouldn't be here in the correctable case. There must * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ ASSERT(parity_untried == 0); ASSERT(parity_errors < rm->rm_firstdatacol); /* * Identify the data columns that reported an error. */ n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error != 0) { ASSERT(n < VDEV_RAIDZ_MAXPARITY); tgts[n++] = c; } } ASSERT(rm->rm_firstdatacol >= n); code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) { /* * If we read more parity disks than were used * for reconstruction, confirm that the other * parity disks produced correct data. This * routine is suboptimal in that it regenerates * the parity that we already used in addition * to the parity that we're attempting to * verify, but this should be a relatively * uncommon case, and can be optimized if it * becomes a problem. Note that we regenerate * parity when resilvering so we can write it * out to failed devices later. */ if (parity_errors < rm->rm_firstdatacol - n) { n = raidz_parity_verify(rm); unexpected_errors += n; ASSERT(parity_errors + n <= rm->rm_firstdatacol); } goto done; } } } /* * This isn't a typical situation -- either we got a read * error or a child silently returned bad data. Read every * block so we can try again with as much data and parity as * we can track down. If we've already been through once * before, all children will be marked as tried so we'll * proceed to combinatorial reconstruction. */ unexpected_errors = 1; rm->rm_missingdata = 0; rm->rm_missingparity = 0; n = 0; for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_tried) continue; cvd = vdev_child(vd, rc->rc_devidx); ASSERT(cvd != NULL); rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, rc->rc_offset, rc->rc_size); if (rc->rc_error == 0) n++; rc->rc_tried = 1; rc->rc_skipped = 0; } /* * If we managed to read anything more, retry the * reconstruction. */ if (n > 0) goto reconstruct; /* * At this point we've attempted to reconstruct the data given the * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting * in absent data. We check if there is enough additional data to * possibly reconstruct the data and then perform combinatorial * reconstruction over all possible combinations. If that fails, * we're cooked. */ if (total_errors > rm->rm_firstdatacol) { error = EIO; } else if (total_errors < rm->rm_firstdatacol && (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes, total_errors, data_errors)) != 0) { /* * If we didn't use all the available parity for the * combinatorial reconstruction, verify that the remaining * parity is correct. */ if (code != (1 << rm->rm_firstdatacol) - 1) (void) raidz_parity_verify(rm); } else { /* * We're here because either: * * total_errors == rm_first_datacol, or * vdev_raidz_combrec() failed * * In either case, there is enough bad data to prevent * reconstruction. * * Start checksum ereports for all children which haven't * failed, and the IO wasn't speculative. */ error = ECKSUM; } done: vdev_raidz_map_free(rm); return (error); }