diff --git a/include/os/freebsd/spl/sys/ia32/asm_linkage.h b/include/os/freebsd/spl/sys/ia32/asm_linkage.h index bbbd22030213..058d600007af 100644 --- a/include/os/freebsd/spl/sys/ia32/asm_linkage.h +++ b/include/os/freebsd/spl/sys/ia32/asm_linkage.h @@ -1,178 +1,178 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #define RET ret /* Tell compiler to call assembler like Unix */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define ENDBR #define SECTION_TEXT .text #define SECTION_STATIC .data #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. 
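/*
 * Illustrative sketch (not part of the patch): one way the CPTRSHIFT/
 * CPTRSIZE/CPTRMASK constants above can be used to index an array of C
 * pointers from a .S file (with _ASM defined).  The function name
 * example_ptr_load is hypothetical; it assumes the SysV AMD64 ABI with
 * %rdi = void **array and %rsi = index, returning array[index] in %rax.
 */
ENTRY_NP(example_ptr_load)
	mov	%rsi, %rax
	shl	$CPTRSHIFT, %rax	/* index -> byte offset (index * CPTRSIZE) */
	mov	(%rdi, %rax), %rax	/* load array[index] */
	RET
SET_SIZE(example_ptr_load)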
*/ #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) #define SET_OBJ(x) #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/include/os/linux/spl/sys/ia32/asm_linkage.h b/include/os/linux/spl/sys/ia32/asm_linkage.h index 2864d9455129..3aaa4af5dab8 100644 --- a/include/os/linux/spl/sys/ia32/asm_linkage.h +++ b/include/os/linux/spl/sys/ia32/asm_linkage.h @@ -1,212 +1,212 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #if defined(_KERNEL) && defined(__linux__) #include #endif #ifndef ENDBR #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* CSTYLED */ #if __has_include() #include #ifdef _CET_ENDBR #define ENDBR _CET_ENDBR #endif /* _CET_ENDBR */ #endif /* */ #endif /* __ELF__ && __CET__ && __has_include */ #endif /* !ENDBR */ #ifndef ENDBR #define ENDBR #endif #ifndef RET #define RET ret #endif /* You can set to nothing on Unix platforms */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define SECTION_TEXT .text #define SECTION_STATIC .section .rodata #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. 
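/*
 * Illustrative sketch (not part of the patch): the reason for switching
 * from .align to .balign.  In GNU as the operand of .align is a byte count
 * on some targets (for example x86 ELF) but a power-of-two exponent on
 * others (for example arm), while .balign always takes a byte count and
 * .p2align always takes an exponent.  The labels below are hypothetical.
 */
	.text
	.balign	16			/* unambiguous: 16-byte boundary */
aligned_to_16:
	nop
	.p2align	4		/* same effect, written as 2^4 bytes */
also_aligned_to_16:
	nop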
*/ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ #undef ENTRY #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ .type x, @function; \ x: #define FUNCTION(x) \ .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] #define SET_OBJ(x) .type x, @object #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h index 3b4beecc5d34..9964f183cc68 100644 --- a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h @@ -1,184 +1,184 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
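/*
 * Illustrative sketch (not part of the patch): typical use of the macros
 * above in a hand-written .S file.  ENTRY_NP aligns the entry point,
 * exports it and marks it @function; SET_SIZE records the function length
 * for the ELF symbol table.  The function name spl_example_zero and the
 * include path are assumptions based on how other files in the tree use
 * this header.
 */
#define	_ASM
#include <sys/asm_linkage.h>

	SECTION_TEXT
ENTRY_NP(spl_example_zero)
	ENDBR				/* IBT landing pad when CET is enabled */
	xor	%eax, %eax		/* return 0 */
	RET
SET_SIZE(spl_example_zero)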
*/ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #if defined(__linux__) && defined(CONFIG_SLS) #define RET ret; int3 #else #define RET ret #endif /* Tell compiler to call assembler like Unix */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define ENDBR #define SECTION_TEXT .text #define SECTION_STATIC .data #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ x: #define FUNCTION(x) \ .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) #define SET_OBJ(x) #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h index 76765dd040cf..f07596123341 100644 --- a/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h @@ -1,211 +1,211 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
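/*
 * Illustrative sketch (not part of the patch): what the RET macro above
 * expands to.  With CONFIG_SLS (the kernel's straight-line-speculation
 * mitigation) every return becomes "ret; int3" so the CPU cannot
 * speculatively execute past the ret; otherwise it is a plain "ret".
 * The label name is hypothetical.
 */
example_return_zero:
	xor	%eax, %eax
	RET				/* "ret" or, under CONFIG_SLS, "ret; int3" */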
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #if defined(_KERNEL) && defined(__linux__) #include #endif #ifndef ENDBR #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* CSTYLED */ #if __has_include() #include #ifdef _CET_ENDBR #define ENDBR _CET_ENDBR #endif /* _CET_ENDBR */ #endif /* */ #endif /* __ELF__ && __CET__ && __has_include */ #endif /* !ENDBR */ #ifndef ENDBR #define ENDBR #endif #ifndef RET #define RET ret #endif /* You can set to nothing on Unix platforms */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define SECTION_TEXT .text #define SECTION_STATIC .section .rodata #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. 
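/*
 * Illustrative sketch (not part of the patch): how ENDBR is meant to be
 * used.  When the toolchain provides cet.h and CET/IBT is enabled,
 * _CET_ENDBR expands to an endbr64/endbr32 instruction, which must be the
 * first instruction at any indirect-branch target; otherwise ENDBR expands
 * to nothing.  The function name is hypothetical.
 */
ENTRY_NP(example_ibt_target)
	ENDBR				/* endbr64 under CET/IBT, empty otherwise */
	mov	$1, %eax
	RET
SET_SIZE(example_ibt_target)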
*/ #undef ENTRY #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ .type x, @function; \ x: #define FUNCTION(x) \ .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] #define SET_OBJ(x) .type x, @object #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/icp/asm-x86_64/aes/aes_aesni.S b/module/icp/asm-x86_64/aes/aes_aesni.S index f622235bd15b..4f3fe3ec65d6 100644 --- a/module/icp/asm-x86_64/aes/aes_aesni.S +++ b/module/icp/asm-x86_64/aes/aes_aesni.S @@ -1,748 +1,748 @@ /* * ==================================================================== * Written by Intel Corporation for the OpenSSL project to add support * for Intel AES-NI instructions. Rights for redistribution and usage * in source and binary forms are granted according to the OpenSSL * license. * * Author: Huang Ying * Vinodh Gopal * Kahraman Akdemir * * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) * instructions that are going to be introduced in the next generation * of Intel processor, as of 2009. These instructions enable fast and * secure data encryption and decryption, using the Advanced Encryption * Standard (AES), defined by FIPS Publication number 197. The * architecture introduces six instructions that offer full hardware * support for AES. Four of them support high performance data * encryption and decryption, and the other two instructions support * the AES key expansion procedure. * ==================================================================== */ /* * ==================================================================== * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. All advertising materials mentioning features or use of this * software must display the following acknowledgment: * "This product includes software developed by the OpenSSL Project * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" * * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to * endorse or promote products derived from this software without * prior written permission. For written permission, please contact * openssl-core@openssl.org. * * 5. 
Products derived from this software may not be called "OpenSSL" * nor may "OpenSSL" appear in their names without prior written * permission of the OpenSSL Project. * * 6. Redistributions of any form whatsoever must retain the following * acknowledgment: * "This product includes software developed by the OpenSSL Project * for use in the OpenSSL Toolkit (http://www.openssl.org/)" * * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== */ /* * ==================================================================== * OpenSolaris OS modifications * * This source originates as files aes-intel.S and eng_aesni_asm.pl, in * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by * Huang Ying of Intel to the openssl-dev mailing list under the subject * of "Add support to Intel AES-NI instruction set for x86_64 platform". * * This OpenSolaris version has these major changes from the original source: * * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 2. Formatted code, added comments, and added #includes and #defines. * * 3. If bit CR0.TS is set, clear and set the TS bit, after and before * calling kpreempt_disable() and kpreempt_enable(). * If the TS bit is not set, Save and restore %xmm registers at the beginning * and end of function calls (%xmm* registers are not saved and restored by * during kernel thread preemption). * * 4. Renamed functions, reordered parameters, and changed return value * to match OpenSolaris: * * OpenSSL interface: * int intel_AES_set_encrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * int intel_AES_set_decrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return values for above are non-zero on error, 0 on success. * * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); * typedef struct aes_key_st { * unsigned int rd_key[4 *(AES_MAXNR + 1)]; * int rounds; * unsigned int pad[3]; * } AES_KEY; * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules * (ks32) instead of 64-bit (ks64). * Number of rounds (aka round count) is at offset 240 of AES_KEY. * * OpenSolaris OS interface (#ifdefs removed for readability): * int rijndael_key_setup_dec_intel(uint32_t rk[], * const uint32_t cipherKey[], uint64_t keyBits); * int rijndael_key_setup_enc_intel(uint32_t rk[], * const uint32_t cipherKey[], uint64_t keyBits); * Return values for above are 0 on error, number of rounds on success. 
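/*
 * Illustrative sketch (not part of the patch): calling the OpenSolaris-style
 * key setup described above from assembly under the SysV AMD64 ABI
 * (P1 = %rdi = rk[], P2 = %rsi = cipherKey[], P3 = %rdx = keyBits).
 * The labels my_ks, my_key and key_setup_failed are hypothetical, and in
 * kernel context the caller must also have disabled preemption.
 */
	lea	my_ks(%rip), %rdi	/* uint32_t rk[]: key schedule output */
	lea	my_key(%rip), %rsi	/* const uint32_t cipherKey[] */
	mov	$256, %edx		/* uint64_t keyBits */
	call	rijndael_key_setup_enc_intel
	test	%rax, %rax		/* 0 = error, else number of rounds (14) */
	jz	key_setup_failed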
* * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]); * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]); * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; * * typedef union { * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; * } aes_ks_t; * typedef struct aes_key { * aes_ks_t encr_ks, decr_ks; * long double align128; * int flags, nr, type; * } aes_key_t; * * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. * * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. * * ==================================================================== */ #if defined(lint) || defined(__lint) #include void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { (void) rk, (void) Nr, (void) pt, (void) ct; } void aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { (void) rk, (void) Nr, (void) ct, (void) pt; } int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits) { (void) rk, (void) cipherKey, (void) keyBits; return (0); } int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits) { (void) rk, (void) cipherKey, (void) keyBits; return (0); } #elif defined(HAVE_AES) /* guard by instruction set */ #define _ASM #include /* * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), * _key_expansion_256a(), _key_expansion_256b() * * Helper functions called by rijndael_key_setup_inc_intel(). * Also used indirectly by rijndael_key_setup_dec_intel(). * * Input: * %xmm0 User-provided cipher key * %xmm1 Round constant * Output: * (%rcx) AES key */ ENTRY_NP2(_key_expansion_128, _key_expansion_256a) _key_expansion_128_local: _key_expansion_256a_local: pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm0, (%rcx) add $0x10, %rcx RET nop SET_SIZE(_key_expansion_128) SET_SIZE(_key_expansion_256a) ENTRY_NP(_key_expansion_192a) _key_expansion_192a_local: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm2, %xmm5 movups %xmm2, %xmm6 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movups %xmm0, %xmm1 shufps $0b01000100, %xmm0, %xmm6 movups %xmm6, (%rcx) shufps $0b01001110, %xmm2, %xmm1 movups %xmm1, 0x10(%rcx) add $0x20, %rcx RET SET_SIZE(_key_expansion_192a) ENTRY_NP(_key_expansion_192b) _key_expansion_192b_local: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm2, %xmm5 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movups %xmm0, (%rcx) add $0x10, %rcx RET SET_SIZE(_key_expansion_192b) ENTRY_NP(_key_expansion_256b) _key_expansion_256b_local: pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 pxor %xmm4, %xmm2 shufps $0b10001100, %xmm2, %xmm4 pxor %xmm4, %xmm2 pxor %xmm1, %xmm2 movups %xmm2, (%rcx) add $0x10, %rcx RET SET_SIZE(_key_expansion_256b) /* * rijndael_key_setup_enc_intel() * Expand the cipher key into the encryption key schedule. 
* * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * OpenSolaris interface: * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], * uint64_t keyBits); * Return value is 0 on error, number of rounds on success. * * Original Intel OpenSSL interface: * int intel_AES_set_encrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return value is non-zero on error, 0 on success. */ #ifdef OPENSSL_INTERFACE #define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key #define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key #define USERCIPHERKEY rdi /* P1, 64 bits */ #define KEYSIZE32 esi /* P2, 32 bits */ #define KEYSIZE64 rsi /* P2, 64 bits */ #define AESKEY rdx /* P3, 64 bits */ #else /* OpenSolaris Interface */ #define AESKEY rdi /* P1, 64 bits */ #define USERCIPHERKEY rsi /* P2, 64 bits */ #define KEYSIZE32 edx /* P3, 32 bits */ #define KEYSIZE64 rdx /* P3, 64 bits */ #endif /* OPENSSL_INTERFACE */ #define ROUNDS32 KEYSIZE32 /* temp */ #define ROUNDS64 KEYSIZE64 /* temp */ #define ENDAESKEY USERCIPHERKEY /* temp */ ENTRY_NP(rijndael_key_setup_enc_intel) rijndael_key_setup_enc_intel_local: FRAME_BEGIN // NULL pointer sanity check test %USERCIPHERKEY, %USERCIPHERKEY jz .Lenc_key_invalid_param test %AESKEY, %AESKEY jz .Lenc_key_invalid_param movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) movups %xmm0, (%AESKEY) lea 0x10(%AESKEY), %rcx // key addr pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x cmp $256, %KEYSIZE32 jnz .Lenc_key192 // AES 256: 14 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $14, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 #endif /* OPENSSL_INTERFACE */ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) movups %xmm2, (%rcx) add $0x10, %rcx aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x1, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x2, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x4, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x8, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x10, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x20, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* Open Solaris Interface */ mov $14, %rax // return # rounds = 14 #endif FRAME_END RET -.align 4 +.balign 4 .Lenc_key192: cmp $192, %KEYSIZE32 jnz .Lenc_key128 // AES 192: 12 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $12, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12 #endif /* OPENSSL_INTERFACE */ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key 
call _key_expansion_192a_local aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* OpenSolaris Interface */ mov $12, %rax // return # rounds = 12 #endif FRAME_END RET -.align 4 +.balign 4 .Lenc_key128: cmp $128, %KEYSIZE32 jnz .Lenc_key_invalid_key_bits // AES 128: 10 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $10, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 #endif /* OPENSSL_INTERFACE */ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key call _key_expansion_128_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* OpenSolaris Interface */ mov $10, %rax // return # rounds = 10 #endif FRAME_END RET .Lenc_key_invalid_param: #ifdef OPENSSL_INTERFACE mov $-1, %rax // user key or AES key pointer is NULL FRAME_END RET #else /* FALLTHROUGH */ #endif /* OPENSSL_INTERFACE */ .Lenc_key_invalid_key_bits: #ifdef OPENSSL_INTERFACE mov $-2, %rax // keysize is invalid #else /* Open Solaris Interface */ xor %rax, %rax // a key pointer is NULL or invalid keysize #endif /* OPENSSL_INTERFACE */ FRAME_END RET SET_SIZE(rijndael_key_setup_enc_intel) /* * rijndael_key_setup_dec_intel() * Expand the cipher key into the decryption key schedule. * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * OpenSolaris interface: * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], * uint64_t keyBits); * Return value is 0 on error, number of rounds on success. * P1->P2, P2->P3, P3->P1 * * Original Intel OpenSSL interface: * int intel_AES_set_decrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return value is non-zero on error, 0 on success. 
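/*
 * Illustrative sketch (not part of the patch): the core idea behind the
 * decryption key setup that follows.  Apart from reordering, every round
 * key except the first and last is run through aesimc (InvMixColumns) so
 * that aesdec can use the keys directly.  Register choices and the numeric
 * label are hypothetical: %rcx = first inner round key, %rdx = one past
 * the last inner round key.
 */
1:	movups	(%rcx), %xmm0
	aesimc	%xmm0, %xmm0		/* convert one encryption round key */
	movups	%xmm0, (%rcx)
	add	$0x10, %rcx
	cmp	%rdx, %rcx
	jne	1b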
*/ ENTRY_NP(rijndael_key_setup_dec_intel) FRAME_BEGIN // Generate round keys used for encryption call rijndael_key_setup_enc_intel_local test %rax, %rax #ifdef OPENSSL_INTERFACE jnz .Ldec_key_exit // Failed if returned non-0 #else /* OpenSolaris Interface */ jz .Ldec_key_exit // Failed if returned 0 #endif /* OPENSSL_INTERFACE */ /* * Convert round keys used for encryption * to a form usable for decryption */ #ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) // (already set for OpenSSL) #endif lea 0x10(%AESKEY), %rcx // key addr shl $4, %ROUNDS32 add %AESKEY, %ROUNDS64 mov %ROUNDS64, %ENDAESKEY -.align 4 +.balign 4 .Ldec_key_reorder_loop: movups (%AESKEY), %xmm0 movups (%ROUNDS64), %xmm1 movups %xmm0, (%ROUNDS64) movups %xmm1, (%AESKEY) lea 0x10(%AESKEY), %AESKEY lea -0x10(%ROUNDS64), %ROUNDS64 cmp %AESKEY, %ROUNDS64 ja .Ldec_key_reorder_loop -.align 4 +.balign 4 .Ldec_key_inv_loop: movups (%rcx), %xmm0 // Convert an encryption round key to a form usable for decryption // with the "AES Inverse Mix Columns" instruction aesimc %xmm0, %xmm1 movups %xmm1, (%rcx) lea 0x10(%rcx), %rcx cmp %ENDAESKEY, %rcx jnz .Ldec_key_inv_loop .Ldec_key_exit: // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error // OpenSSL: rax = 0 for OK, or non-zero for error FRAME_END RET SET_SIZE(rijndael_key_setup_dec_intel) /* * aes_encrypt_intel() * Encrypt a single block (in and out can overlap). * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * Temporary register usage: * %xmm0 State * %xmm1 Key * * Original OpenSolaris Interface: * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]) * * Original Intel OpenSSL Interface: * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key) */ #ifdef OPENSSL_INTERFACE #define aes_encrypt_intel intel_AES_encrypt #define aes_decrypt_intel intel_AES_decrypt #define INP rdi /* P1, 64 bits */ #define OUTP rsi /* P2, 64 bits */ #define KEYP rdx /* P3, 64 bits */ /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ #define NROUNDS32 ecx /* temporary, 32 bits */ #define NROUNDS cl /* temporary, 8 bits */ #else /* OpenSolaris Interface */ #define KEYP rdi /* P1, 64 bits */ #define NROUNDS esi /* P2, 32 bits */ #define INP rdx /* P3, 64 bits */ #define OUTP rcx /* P4, 64 bits */ #endif /* OPENSSL_INTERFACE */ #define STATE xmm0 /* temporary, 128 bits */ #define KEY xmm1 /* temporary, 128 bits */ ENTRY_NP(aes_encrypt_intel) movups (%INP), %STATE // input movups (%KEYP), %KEY // key #ifdef OPENSSL_INTERFACE mov 240(%KEYP), %NROUNDS32 // round count #else /* OpenSolaris Interface */ /* Round count is already present as P2 in %rsi/%esi */ #endif /* OPENSSL_INTERFACE */ pxor %KEY, %STATE // round 0 lea 0x30(%KEYP), %KEYP cmp $12, %NROUNDS jb .Lenc128 lea 0x20(%KEYP), %KEYP je .Lenc192 // AES 256 lea 0x20(%KEYP), %KEYP movups -0x60(%KEYP), %KEY aesenc %KEY, %STATE movups -0x50(%KEYP), %KEY aesenc %KEY, %STATE -.align 4 +.balign 4 .Lenc192: // AES 192 and 256 movups -0x40(%KEYP), %KEY aesenc %KEY, %STATE movups -0x30(%KEYP), %KEY aesenc %KEY, %STATE -.align 4 +.balign 4 .Lenc128: // AES 128, 192, and 256 movups -0x20(%KEYP), %KEY aesenc %KEY, %STATE movups -0x10(%KEYP), %KEY aesenc %KEY, 
%STATE movups (%KEYP), %KEY aesenc %KEY, %STATE movups 0x10(%KEYP), %KEY aesenc %KEY, %STATE movups 0x20(%KEYP), %KEY aesenc %KEY, %STATE movups 0x30(%KEYP), %KEY aesenc %KEY, %STATE movups 0x40(%KEYP), %KEY aesenc %KEY, %STATE movups 0x50(%KEYP), %KEY aesenc %KEY, %STATE movups 0x60(%KEYP), %KEY aesenc %KEY, %STATE movups 0x70(%KEYP), %KEY aesenclast %KEY, %STATE // last round movups %STATE, (%OUTP) // output RET SET_SIZE(aes_encrypt_intel) /* * aes_decrypt_intel() * Decrypt a single block (in and out can overlap). * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * Temporary register usage: * %xmm0 State * %xmm1 Key * * Original OpenSolaris Interface: * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original Intel OpenSSL Interface: * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); */ ENTRY_NP(aes_decrypt_intel) movups (%INP), %STATE // input movups (%KEYP), %KEY // key #ifdef OPENSSL_INTERFACE mov 240(%KEYP), %NROUNDS32 // round count #else /* OpenSolaris Interface */ /* Round count is already present as P2 in %rsi/%esi */ #endif /* OPENSSL_INTERFACE */ pxor %KEY, %STATE // round 0 lea 0x30(%KEYP), %KEYP cmp $12, %NROUNDS jb .Ldec128 lea 0x20(%KEYP), %KEYP je .Ldec192 // AES 256 lea 0x20(%KEYP), %KEYP movups -0x60(%KEYP), %KEY aesdec %KEY, %STATE movups -0x50(%KEYP), %KEY aesdec %KEY, %STATE -.align 4 +.balign 4 .Ldec192: // AES 192 and 256 movups -0x40(%KEYP), %KEY aesdec %KEY, %STATE movups -0x30(%KEYP), %KEY aesdec %KEY, %STATE -.align 4 +.balign 4 .Ldec128: // AES 128, 192, and 256 movups -0x20(%KEYP), %KEY aesdec %KEY, %STATE movups -0x10(%KEYP), %KEY aesdec %KEY, %STATE movups (%KEYP), %KEY aesdec %KEY, %STATE movups 0x10(%KEYP), %KEY aesdec %KEY, %STATE movups 0x20(%KEYP), %KEY aesdec %KEY, %STATE movups 0x30(%KEYP), %KEY aesdec %KEY, %STATE movups 0x40(%KEYP), %KEY aesdec %KEY, %STATE movups 0x50(%KEYP), %KEY aesdec %KEY, %STATE movups 0x60(%KEYP), %KEY aesdec %KEY, %STATE movups 0x70(%KEYP), %KEY aesdeclast %KEY, %STATE // last round movups %STATE, (%OUTP) // output RET SET_SIZE(aes_decrypt_intel) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S index d5cf4040fb93..c4870a28ead6 100644 --- a/module/icp/asm-x86_64/aes/aes_amd64.S +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -1,908 +1,908 @@ /* * --------------------------------------------------------------------------- * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. * * LICENSE TERMS * * The free distribution and use of this software is allowed (with or without * changes) provided that: * * 1. source code distributions include the above copyright notice, this * list of conditions and the following disclaimer; * * 2. binary distributions include the above copyright notice, this list * of conditions and the following disclaimer in their documentation; * * 3. the name of the copyright holder is not used to endorse products * built using this software without specific written permission. 
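/*
 * Illustrative sketch (not part of the patch): the round-count dispatch
 * used by aes_encrypt_intel/aes_decrypt_intel above.  The key pointer is
 * biased so that all key sizes share one tail of rounds: AES-256 runs four
 * extra leading rounds, AES-192 two, and AES-128 jumps straight to the
 * shared tail.  Register and label names here are hypothetical, and the
 * round keys are shown in registers rather than loaded with movups.
 */
	cmp	$12, %esi		/* %esi = round count (10, 12 or 14) */
	jb	.Lshared_tail		/* AES-128 */
	je	.Ltwo_extra		/* AES-192 */
	aesenc	%xmm1, %xmm0		/* AES-256 only */
	aesenc	%xmm2, %xmm0
.Ltwo_extra:
	aesenc	%xmm3, %xmm0		/* AES-192 and AES-256 */
	aesenc	%xmm4, %xmm0
.Lshared_tail:
	/* ...nine more aesenc rounds shared by all key sizes... */
	aesenclast %xmm5, %xmm0		/* final round uses aesenclast */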
* * DISCLAIMER * * This software is provided 'as is' with no explicit or implied warranties * in respect of its properties, including, but not limited to, correctness * and/or fitness for purpose. * --------------------------------------------------------------------------- * Issue 20/12/2007 * * I am grateful to Dag Arne Osvik for many discussions of the techniques that * can be used to optimise AES assembler code on AMD64/EM64T architectures. * Some of the techniques used in this implementation are the result of * suggestions made by him for which I am most grateful. * * An AES implementation for AMD64 processors using the YASM assembler. This * implementation provides only encryption, decryption and hence requires key * scheduling support in C. It uses 8k bytes of tables but its encryption and * decryption performance is very close to that obtained using large tables. * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, * which are as follows: * ms windows gnu/linux/opensolaris os * * in_blk rcx rdi * out_blk rdx rsi * context (cx) r8 rdx * * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 * registers rdi - on both * * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 * registers - rdi on both * * The convention used here is that for gnu/linux/opensolaris os. * * This code provides the standard AES block size (128 bits, 16 bytes) and the * three standard AES key sizes (128, 192 and 256 bits). It has the same call * interface as my C implementation. It uses the Microsoft C AMD64 calling * conventions in which the three parameters are placed in rcx, rdx and r8 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. * * OpenSolaris Note: * Modified to use GNU/Linux/Solaris calling conventions. * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. * * AES_RETURN aes_encrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * where is 128, 102 or 256. In the last two calls the length can be in * either bits or bytes. * * Comment in/out the following lines to obtain the desired subroutines. These * selections MUST match those in the C header file aesopt.h */ #define AES_REV_DKS /* define if key decryption schedule is reversed */ #define LAST_ROUND_TABLES /* define for the faster version using extra tables */ /* * The encryption key schedule has the following in memory layout where N is the * number of rounds (10, 12 or 14): * * lo: | input key (round 0) | / each round is four 32-bit words * | encryption round 1 | * | encryption round 2 | * .... 
* | encryption round N-1 | * hi: | encryption round N | * * The decryption key schedule is normally set up so that it has the same * layout as above by actually reversing the order of the encryption key * schedule in memory (this happens when AES_REV_DKS is set): * * lo: | decryption round 0 | = | encryption round N | * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] * hi: | decryption round N | = | input key (round 0) | * * with rounds except the first and last modified using inv_mix_column() * But if AES_REV_DKS is NOT set the order of keys is left as it is for * encryption so that it has to be accessed in reverse when used for * decryption (although the inverse mix column modifications are done) * * lo: | decryption round 0 | = | input key (round 0) | * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] * hi: | decryption round N | = | encryption round N | * * This layout is faster when the assembler key scheduling provided here * is used. * * End of user defines */ /* * --------------------------------------------------------------------------- * OpenSolaris OS modifications * * This source originates from Brian Gladman file aes_amd64.asm * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip * with these changes: * * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, * AES_128, AES_192, AES_256, AES_VAR ifdefs. * * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define * * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef * * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax * (operands reversed, literals prefixed with "$", registers prefixed with "%", * and "[register+offset]", addressing changed to "offset(register)", * parenthesis in constant expressions "()" changed to square brackets "[]", * "." removed from local (numeric) labels, and other changes. * Examples: * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax * mov rax,(4*20h) mov $[4*0x20],%rax * mov rax,[ebx+20h] mov 0x20(%ebx),%rax * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax * * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 6. Renamed functions and reordered parameters to match OpenSolaris: * Original Gladman interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, * and a union type, inf., containing inf.l, a uint32_t and * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is * used and contains the key schedule length * 16 where key schedule length is * 10, 12, or 14 bytes. 
* * OpenSolaris OS interface: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. */ #if defined(lint) || defined(__lint) #include void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { (void) rk, (void) Nr, (void) pt, (void) ct; } void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { (void) rk, (void) Nr, (void) pt, (void) ct; } #else #define _ASM #include #define KS_LENGTH 60 #define raxd eax #define rdxd edx #define rcxd ecx #define rbxd ebx #define rsid esi #define rdid edi #define raxb al #define rdxb dl #define rcxb cl #define rbxb bl #define rsib sil #define rdib dil // finite field multiplies by {02}, {04} and {08} #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) // finite field multiplies required in table generation #define f3(x) ((f2(x)) ^ (x)) #define f9(x) ((f8(x)) ^ (x)) #define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) #define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) #define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) // macros for expanding S-box data #define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) #define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) #define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 #define enc_vals(x) \ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ .byte 
x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) #define dec_vals(x) \ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) #define tptr %rbp /* table pointer */ #define kptr %r8 /* key schedule pointer */ #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ #define fk_ref(x, y) -16*x+fofs+4*y(kptr) #ifdef AES_REV_DKS #define rofs 128 #define ik_ref(x, y) -16*x+rofs+4*y(kptr) #else #define rofs -128 #define ik_ref(x, y) 16*x+rofs+4*y(kptr) #endif /* AES_REV_DKS */ #define tab_0(x) (tptr,x,8) #define tab_1(x) 3(tptr,x,8) #define tab_2(x) 2(tptr,x,8) #define tab_3(x) 1(tptr,x,8) #define tab_f(x) 1(tptr,x,8) #define tab_i(x) 7(tptr,x,8) #define 
ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ add $2048, tptr; \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1 #else #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p1 #endif /* LAST_ROUND_TABLES */ #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov 
ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ add $2048, tptr; \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3 #else #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %eax; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ebx; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p3 #endif /* LAST_ROUND_TABLES */ /* * OpenSolaris OS: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ SECTION_STATIC 
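A minimal C-side sketch of how this entry point (and aes_decrypt_amd64 further below) is called under the OpenSolaris interface documented above; it assumes the key schedule has already been expanded by the ICP key-setup code, and the wrapper name encrypt_block plus the use of a plain void * in place of aes_ks_t are illustrative only, not taken from this diff:

#include <stdint.h>
#include <string.h>

/* Assembly entry points from this file (SysV ABI via ASMABI). */
extern void aes_encrypt_amd64(const void *ks, int Nr,
    const uint32_t pt[4], uint32_t ct[4]);
extern void aes_decrypt_amd64(const void *ks, int Nr,
    const uint32_t pt[4], uint32_t ct[4]);

/* Hypothetical caller: ks is the expanded key schedule, Nr is 10, 12 or 14. */
static void encrypt_block(const void *ks, int Nr,
    const uint8_t in[16], uint8_t out[16])
{
	uint32_t pt[4], ct[4];

	memcpy(pt, in, sizeof (pt));
	aes_encrypt_amd64(ks, Nr, pt, ct);	/* P1=ks, P2=Nr, P3=pt, P4=ct */
	memcpy(out, ct, sizeof (ct));
}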
-.align 64 +.balign 64 enc_tab: enc_vals(u8) #ifdef LAST_ROUND_TABLES // Last Round Tables: enc_vals(w8) #endif ENTRY_NP(aes_encrypt_amd64) ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea enc_tab(%rip), tptr sub $fofs, kptr // Load input block into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx xor fofs(kptr), %eax xor fofs+4(kptr), %ebx xor fofs+8(kptr), %ecx xor fofs+12(kptr), %edx lea (kptr,%rsi), kptr // Jump based on byte key length * 16: cmp $(10*16), %esi je 3f cmp $(12*16), %esi je 2f cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal forward rounds 1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 add $(4*8), %rsp RET SET_SIZE(aes_encrypt_amd64) /* * OpenSolaris OS: * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ SECTION_STATIC -.align 64 +.balign 64 dec_tab: dec_vals(v8) #ifdef LAST_ROUND_TABLES // Last Round Tables: dec_vals(w8) #endif ENTRY_NP(aes_decrypt_amd64) ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea dec_tab(%rip), tptr sub $rofs, kptr // Load input block 
into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx #ifdef AES_REV_DKS mov kptr, %rdi lea (kptr,%rsi), kptr #else lea (kptr,%rsi), %rdi #endif xor rofs(%rdi), %eax xor rofs+4(%rdi), %ebx xor rofs+8(%rdi), %ecx xor rofs+12(%rdi), %edx // Jump based on byte key length * 16: cmp $(10*16), %esi je 3f cmp $(12*16), %esi je 2f cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal inverse rounds 1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) il_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 add $(4*8), %rsp RET SET_SIZE(aes_decrypt_amd64) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index 75dd2c721f56..165492a0ed76 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -1,1259 +1,1259 @@ # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. # # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). 
# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include /* Windows userland links with OpenSSL */ #if !defined (_WIN32) || defined (_KERNEL) .extern gcm_avx_can_use_movbe .text #ifdef HAVE_MOVBE -.align 32 +.balign 32 FUNCTION(_aesni_ctr32_ghash_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x -.align 32 +.balign 32 .Loop6x: addl $100663296,%ebx jc .Lhandle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. 
jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail -.align 32 +.balign 32 .Lhandle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32 -.align 32 +.balign 32 .Lenc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc SET_SIZE(_aesni_ctr32_ghash_6x) #endif /* ifdef HAVE_MOVBE */ -.align 32 +.balign 32 FUNCTION(_aesni_ctr32_ghash_no_movbe_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x_nmb -.align 32 +.balign 32 .Loop6x_nmb: addl $100663296,%ebx jc .Lhandle_ctr32_nmb vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32_nmb: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 
vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movq 88(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 80(%r14),%r12 bswapq %r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movq 72(%r14),%r13 bswapq %r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movq 64(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movq 56(%r14),%r13 bswapq %r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movq 48(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movq 40(%r14),%r13 bswapq %r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movq 32(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movq 24(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 16(%r14),%r12 bswapq %r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movq 
8(%r14),%r13 bswapq %r13 vaesenc %xmm1,%xmm13,%xmm13 movq 0(%r14),%r12 bswapq %r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail_nmb -.align 32 +.balign 32 .Lhandle_ctr32_nmb: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32_nmb -.align 32 +.balign 32 .Lenc_tail_nmb: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done_nmb vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x_nmb .L6x_done_nmb: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x) ENTRY_ALIGN(aesni_gcm_decrypt, 32) .cfi_startproc ENDBR xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq 
$0xf80,%r15 vmovdqu (%r9),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx movq 32(%r9),%r9 leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Ldec_no_key_aliasing cmpq $768,%r15 jnc .Ldec_no_key_aliasing subq %r15,%rsp .Ldec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax RET .cfi_endproc SET_SIZE(aesni_gcm_decrypt) -.align 32 +.balign 32 FUNCTION(_aesni_ctr32_6x) .cfi_startproc ENDBR vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
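	// %r13 = Nr - 2: it counts the vaesenc iterations of .Loop_ctr32 below;
	// the round-0 whitening (vpxor), the round Nr-1 vaesenc and the final
	// vaesenclast are handled outside of that loop.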
vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc .Lhandle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 -.align 16 +.balign 16 .Loop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz .Loop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi RET -.align 32 +.balign 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc SET_SIZE(_aesni_ctr32_6x) ENTRY_ALIGN(aesni_gcm_encrypt, 32) .cfi_startproc ENDBR xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. 
andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Lenc_no_key_aliasing cmpq $768,%r15 jnc .Lenc_no_key_aliasing subq %r15,%rsp .Lenc_no_key_aliasing: leaq (%rsi),%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 movq 32(%r9),%r9 leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi) vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor %xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor %xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 
vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax RET .cfi_endproc SET_SIZE(aesni_gcm_encrypt) #endif /* !_WIN32 || _KERNEL */ /* Some utility routines */ /* * clear all fpu registers * void clear_fpu_regs_avx(void); */ ENTRY_ALIGN(clear_fpu_regs_avx, 32) vzeroall RET SET_SIZE(clear_fpu_regs_avx) /* * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); * * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and * stores the result at `dst'. The XOR is performed using FPU registers, * so make sure FPU state is saved when running this in the kernel. */ ENTRY_ALIGN(gcm_xor_avx, 32) movdqu (%rdi), %xmm0 movdqu (%rsi), %xmm1 pxor %xmm1, %xmm0 movdqu %xmm0, (%rsi) RET SET_SIZE(gcm_xor_avx) /* * Toggle a boolean_t value atomically and return the new value. 
* boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); */ ENTRY_ALIGN(atomic_toggle_boolean_nv, 32) xorl %eax, %eax lock xorl $1, (%rdi) jz 1f movl $1, %eax 1: RET SET_SIZE(atomic_toggle_boolean_nv) SECTION_STATIC -.align 64 +.balign 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 +.balign 64 /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S index eb9514e10cda..e40b3df32753 100644 --- a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S +++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S @@ -1,254 +1,254 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009 Intel Corporation * All Rights Reserved. */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Accelerated GHASH implementation with Intel PCLMULQDQ-NI * instructions. This file contains an accelerated * Galois Field Multiplication implementation. * * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, * carry-less multiplication. More information about PCLMULQDQ can be * found at: * http://software.intel.com/en-us/articles/ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ * */ /* * ==================================================================== * OpenSolaris OS modifications * * This source originates as file galois_hash_asm.c from * Intel Corporation dated September 21, 2009. * * This OpenSolaris version has these major changes from the original source: * * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function * definition for lint. * * 2. Formatted code, added comments, and added #includes and #defines. * * 3. If bit CR0.TS is set, clear and set the TS bit, after and before * calling kpreempt_disable() and kpreempt_enable(). * If the TS bit is not set, Save and restore %xmm registers at the beginning * and end of function calls (%xmm* registers are not saved and restored by * during kernel thread preemption). * * 4. Removed code to perform hashing. This is already done with C macro * GHASH in gcm.c. 
For better performance, this removed code should be * reintegrated in the future to replace the C GHASH macro. * * 5. Added code to byte swap 16-byte input and output. * * 6. Folded in comments from the original C source with embedded assembly * (SB_w_shift_xor.c) * * 7. Renamed function and reordered parameters to match OpenSolaris: * Intel interface: * void galois_hash_asm(unsigned char *hk, unsigned char *s, * unsigned char *d, int length) * OpenSolaris OS interface: * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); * ==================================================================== */ #if defined(lint) || defined(__lint) /* lint */ #include void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { (void) x_in, (void) y, (void) res; } #elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ #define _ASM #include /* * Use this mask to byte-swap a 16-byte integer with the pshufb instruction */ // static uint8_t byte_swap16_mask[] = { // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; .section .rodata -.align XMM_ALIGN +.balign XMM_ALIGN .Lbyte_swap16_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); * * Perform a carry-less multiplication (that is, use XOR instead of the * multiply operator) on P1 and P2 and place the result in P3. * * Byte swap the input and the output. * * Note: x_in, y, and res all point to a block of 20-byte numbers * (an array of two 64-bit integers). * * Note2: For kernel code, caller is responsible for ensuring * kpreempt_disable() has been called. This is because %xmm registers are * not saved/restored. Clear and set the CR0.TS bit on entry and exit, * respectively, if TS is set on entry. Otherwise, if TS is not set, * save and restore %xmm registers on the stack. * * Note3: Original Intel definition: * void galois_hash_asm(unsigned char *hk, unsigned char *s, * unsigned char *d, int length) * * Note4: Register/parameter mapping: * Intel: * Parameter 1: %rcx (copied to %xmm0) hk or x_in * Parameter 2: %rdx (copied to %xmm1) s or y * Parameter 3: %rdi (result) d or res * OpenSolaris: * Parameter 1: %rdi (copied to %xmm0) x_in * Parameter 2: %rsi (copied to %xmm1) y * Parameter 3: %rdx (result) res */ ENTRY_NP(gcm_mul_pclmulqdq) // // Copy Parameters // movdqu (%rdi), %xmm0 // P1 movdqu (%rsi), %xmm1 // P2 // // Byte swap 16-byte input // lea .Lbyte_swap16_mask(%rip), %rax movups (%rax), %xmm10 pshufb %xmm10, %xmm0 pshufb %xmm10, %xmm1 // // Multiply with the hash key // movdqu %xmm0, %xmm3 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 movdqu %xmm0, %xmm4 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 movdqu %xmm0, %xmm5 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 movdqu %xmm0, %xmm6 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 psrldq $8, %xmm4 // shift by xmm4 64 bits to the right pslldq $8, %xmm5 // shift by xmm5 64 bits to the left pxor %xmm5, %xmm3 pxor %xmm4, %xmm6 // Register pair holds the result // of the carry-less multiplication of // xmm0 by xmm1. // We shift the result of the multiplication by one bit position // to the left to cope for the fact that the bits are reversed. 
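	// There is no full 128-bit bit shift instruction, so the 1-bit left
	// shift of the product held in xmm6:xmm3 is assembled per 32-bit lane:
	// pslld $1 shifts each lane, psrld $31 captures the bit carried out of
	// every lane, pslldq $4 moves those carries into the next lane up, and
	// psrldq $12 routes the carry out of the top lane of xmm3 into the low
	// lane of xmm6.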
movdqu %xmm3, %xmm7 movdqu %xmm6, %xmm8 pslld $1, %xmm3 pslld $1, %xmm6 psrld $31, %xmm7 psrld $31, %xmm8 movdqu %xmm7, %xmm9 pslldq $4, %xmm8 pslldq $4, %xmm7 psrldq $12, %xmm9 por %xmm7, %xmm3 por %xmm8, %xmm6 por %xmm9, %xmm6 // // First phase of the reduction // // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts // independently. movdqu %xmm3, %xmm7 movdqu %xmm3, %xmm8 movdqu %xmm3, %xmm9 pslld $31, %xmm7 // packed right shift shifting << 31 pslld $30, %xmm8 // packed right shift shifting << 30 pslld $25, %xmm9 // packed right shift shifting << 25 pxor %xmm8, %xmm7 // xor the shifted versions pxor %xmm9, %xmm7 movdqu %xmm7, %xmm8 pslldq $12, %xmm7 psrldq $4, %xmm8 pxor %xmm7, %xmm3 // first phase of the reduction complete // // Second phase of the reduction // // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these // shift operations. movdqu %xmm3, %xmm2 movdqu %xmm3, %xmm4 // packed left shifting >> 1 movdqu %xmm3, %xmm5 psrld $1, %xmm2 psrld $2, %xmm4 // packed left shifting >> 2 psrld $7, %xmm5 // packed left shifting >> 7 pxor %xmm4, %xmm2 // xor the shifted versions pxor %xmm5, %xmm2 pxor %xmm8, %xmm2 pxor %xmm2, %xmm3 pxor %xmm3, %xmm6 // the result is in xmm6 // // Byte swap 16-byte result // pshufb %xmm10, %xmm6 // %xmm10 has the swap mask // // Store the result // movdqu %xmm6, (%rdx) // P3 // // Return // RET SET_SIZE(gcm_mul_pclmulqdq) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S index d48b4f2155cc..f62e056d4b64 100644 --- a/module/icp/asm-x86_64/modes/ghash-x86_64.S +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -1,720 +1,720 @@ # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. # Performance results are for this streamed GHASH subroutine and are # expressed in cycles per processed byte, less is better: # # gcc 3.4.x(*) assembler # # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results # are for "528B";-) # (**) it's mystery [to me] why Core2 result is not same as for # Opteron; # May 2010 # # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. # See ghash-x86.pl for background information and details about coding # techniques. 
# # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9, increase reduction aggregate factor to 4x. As for # the latter. ghash-x86.pl discusses that it makes lesser sense to # increase aggregate factor. Then why increase here? Critical path # consists of 3 independent pclmulqdq instructions, Karatsuba post- # processing and reduction. "On top" of this we lay down aggregated # multiplication operations, triplets of independent pclmulqdq's. As # issue rate for pclmulqdq is limited, it makes lesser sense to # aggregate more multiplications than it takes to perform remaining # non-multiplication operations. 2x is near-optimal coefficient for # contemporary Intel CPUs (therefore modest improvement coefficient), # but not for Bulldozer. Latter is because logical SIMD operations # are twice as slow in comparison to Intel, so that critical path is # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. 
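For reference, the GHASH routines below all reduce to multiplication in GF(2^128) with the reflected bit order used by GCM. The following is a minimal, unoptimized C sketch of that multiplication (the plain shift-and-xor method from the GCM specification, not the table-driven or carry-less-multiply techniques this file actually uses); the ghash_block_t and gf128_mul_sketch names are illustrative only:

#include <stdint.h>

/* One 16-byte GHASH block, kept as two big-endian 64-bit halves. */
typedef struct { uint64_t hi, lo; } ghash_block_t;

/* x = x * y in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, GHASH bit order. */
static void gf128_mul_sketch(ghash_block_t *x, const ghash_block_t *y)
{
	ghash_block_t z = { 0, 0 };
	ghash_block_t v = *y;

	for (int i = 0; i < 128; i++) {
		/* Bit i of x, counting from the MSB of its first byte. */
		uint64_t bit = (i < 64) ?
		    (x->hi >> (63 - i)) & 1 : (x->lo >> (127 - i)) & 1;
		if (bit) {
			z.hi ^= v.hi;
			z.lo ^= v.lo;
		}
		/* v = v * alpha: shift right one bit, reduce with 0xE1||0^120. */
		uint64_t carry = v.lo & 1;
		v.lo = (v.lo >> 1) | (v.hi << 63);
		v.hi >>= 1;
		if (carry)
			v.hi ^= 0xe100000000000000ULL;
	}
	*x = z;
}

gcm_init_htab_avx below precomputes a table of powers of the hash key H in this field so that gcm_ghash_avx can fold eight input blocks per pass of its main loop.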
#if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include .text /* Windows userland links with OpenSSL */ #if !defined (_WIN32) || defined (_KERNEL) ENTRY_ALIGN(gcm_gmult_clmul, 16) .cfi_startproc ENDBR .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) RET .cfi_endproc SET_SIZE(gcm_gmult_clmul) #endif /* !_WIN32 || _KERNEL */ ENTRY_ALIGN(gcm_init_htab_avx, 32) .cfi_startproc ENDBR vzeroupper vmovdqu (%rsi),%xmm2 // KCF/ICP stores H in network byte order with the hi qword first // so we need to swap all bytes, not the 2 qwords. vmovdqu .Lbswap_mask(%rip),%xmm4 vpshufb %xmm4,%xmm2,%xmm2 vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm2,%xmm2,%xmm6 vmovdqa %xmm2,%xmm0 vpxor %xmm2,%xmm6,%xmm6 movq $4,%r10 jmp .Linit_start_avx -.align 32 +.balign 32 .Linit_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 .Linit_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 vmovdqu %xmm5,0(%rdi) vpxor %xmm0,%xmm4,%xmm4 vmovdqu %xmm0,16(%rdi) leaq 48(%rdi),%rdi subq $1,%r10 jnz .Linit_loop_avx vpalignr $8,%xmm4,%xmm3,%xmm5 vmovdqu %xmm5,-16(%rdi) vzeroupper RET .cfi_endproc SET_SIZE(gcm_init_htab_avx) #if !defined (_WIN32) || defined (_KERNEL) 
ENTRY_ALIGN(gcm_gmult_avx, 32) .cfi_startproc ENDBR jmp .L_gmult_clmul .cfi_endproc SET_SIZE(gcm_gmult_avx) ENTRY_ALIGN(gcm_ghash_avx, 32) .cfi_startproc ENDBR vzeroupper vmovdqu (%rdi),%xmm10 leaq .L0x1c2_polynomial(%rip),%r10 leaq 64(%rsi),%rsi vmovdqu .Lbswap_mask(%rip),%xmm13 vpshufb %xmm13,%xmm10,%xmm10 cmpq $0x80,%rcx jb .Lshort_avx subq $0x80,%rcx vmovdqu 112(%rdx),%xmm14 vmovdqu 0-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vmovdqu 32-64(%rsi),%xmm7 vpunpckhqdq %xmm14,%xmm14,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm14,%xmm9,%xmm9 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 80(%rdx),%xmm14 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 48-64(%rsi),%xmm6 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 64(%rdx),%xmm15 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 48(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 32(%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 16(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu (%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 leaq 128(%rdx),%rdx cmpq $0x80,%rcx jb .Ltail_avx vpxor %xmm10,%xmm15,%xmm15 subq $0x80,%rcx jmp .Loop8x_avx -.align 32 +.balign 32 .Loop8x_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 112(%rdx),%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpxor %xmm15,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 vmovdqu 0-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 vmovdqu 32-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm3,%xmm10,%xmm10 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vxorps %xmm4,%xmm11,%xmm11 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm5,%xmm12,%xmm12 vxorps %xmm15,%xmm8,%xmm8 vmovdqu 80(%rdx),%xmm14 vpxor %xmm10,%xmm12,%xmm12 vpclmulqdq 
$0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm11,%xmm12,%xmm12 vpslldq $8,%xmm12,%xmm9 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vpsrldq $8,%xmm12,%xmm12 vpxor %xmm9,%xmm10,%xmm10 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vxorps %xmm12,%xmm11,%xmm11 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 64(%rdx),%xmm15 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vxorps %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vmovdqu 48(%rdx),%xmm14 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 32(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vxorps %xmm12,%xmm10,%xmm10 vmovdqu 16(%rdx),%xmm14 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vxorps %xmm11,%xmm12,%xmm12 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu (%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm12,%xmm15,%xmm15 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 vpxor %xmm10,%xmm15,%xmm15 leaq 128(%rdx),%rdx subq $0x80,%rcx jnc .Loop8x_avx addq $0x80,%rcx jmp .Ltail_no_xor_avx -.align 32 +.balign 32 .Lshort_avx: vmovdqu -16(%rdx,%rcx,1),%xmm14 leaq (%rdx,%rcx,1),%rdx vmovdqu 0-64(%rsi),%xmm6 vmovdqu 32-64(%rsi),%xmm7 vpshufb %xmm13,%xmm14,%xmm15 vmovdqa %xmm0,%xmm3 vmovdqa %xmm1,%xmm4 vmovdqa %xmm2,%xmm5 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -32(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -48(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 80-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -64(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor 
%xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -80(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 96-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 128-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -96(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -112(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 144-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovq 184-64(%rsi),%xmm7 subq $0x10,%rcx jmp .Ltail_avx -.align 32 +.balign 32 .Ltail_avx: vpxor %xmm10,%xmm15,%xmm15 .Ltail_no_xor_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu (%r10),%xmm12 vpxor %xmm0,%xmm3,%xmm10 vpxor %xmm1,%xmm4,%xmm11 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm10,%xmm5,%xmm5 vpxor %xmm11,%xmm5,%xmm5 vpslldq $8,%xmm5,%xmm9 vpsrldq $8,%xmm5,%xmm5 vpxor %xmm9,%xmm10,%xmm10 vpxor %xmm5,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm11,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 cmpq $0,%rcx jne .Lshort_avx vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper RET .cfi_endproc SET_SIZE(gcm_ghash_avx) #endif /* !_WIN32 || _KERNEL */ SECTION_STATIC -.align 64 +.balign 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,450,0 -.align 64 +.balign 64 SET_OBJ(.Lrem_4bit) .Lrem_4bit: .long 0,0,0,471859200,0,943718400,0,610271232 .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 SET_OBJ(.Lrem_8bit) .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 
0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 +.balign 64 /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index 321d5da461db..f3d701528459 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -1,2090 +1,2090 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. 
So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. */ #if defined(lint) || defined(__lint) #include #include void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { (void) ctx, (void) in, (void) num; } #else #define _ASM #include ENTRY_NP(SHA256TransformBlocks) .cfi_startproc ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*4+4*8,%rsp lea (%rsi,%rdx,4),%rdx # inp+num*16*4 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*4+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+88,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). 
This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K256-.(%rbp),%rbp mov 4*0(%rdi),%eax mov 4*1(%rdi),%ebx mov 4*2(%rdi),%ecx mov 4*3(%rdi),%edx mov 4*4(%rdi),%r8d mov 4*5(%rdi),%r9d mov 4*6(%rdi),%r10d mov 4*7(%rdi),%r11d jmp .Lloop -.align 16 +.balign 16 .Lloop: xor %rdi,%rdi mov 4*0(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*1(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*2(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*3(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*4(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # 
(f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*5(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*6(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*7(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 4*8(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # 
round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*9(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*10(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*11(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*12(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*13(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx 
ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*14(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*15(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) jmp .Lrounds_16_xx -.align 16 +.balign 16 .Lrounds_16_xx: mov 4(%rsp),%r13d mov 56(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 36(%rsp),%r12d add 0(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 8(%rsp),%r13d mov 60(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 40(%rsp),%r12d add 4(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d 
ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 12(%rsp),%r13d mov 0(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 44(%rsp),%r12d add 8(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 16(%rsp),%r13d mov 4(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 48(%rsp),%r12d add 12(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 20(%rsp),%r13d mov 8(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 52(%rsp),%r12d add 16(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 24(%rsp),%r13d mov 12(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror 
$11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 56(%rsp),%r12d add 20(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 28(%rsp),%r13d mov 16(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 60(%rsp),%r12d add 24(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 32(%rsp),%r13d mov 20(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 0(%rsp),%r12d add 28(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 36(%rsp),%r13d mov 24(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 4(%rsp),%r12d add 32(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # 
Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 40(%rsp),%r13d mov 28(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 8(%rsp),%r12d add 36(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 44(%rsp),%r13d mov 32(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 12(%rsp),%r12d add 40(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 48(%rsp),%r13d mov 36(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 16(%rsp),%r12d add 44(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 52(%rsp),%r13d mov 40(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 20(%rsp),%r12d add 48(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 56(%rsp),%r13d mov 44(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 24(%rsp),%r12d add 52(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 60(%rsp),%r13d mov 48(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 28(%rsp),%r12d add 56(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 0(%rsp),%r13d mov 52(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 32(%rsp),%r12d add 60(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d 
ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) cmp $64,%rdi jb .Lrounds_16_xx mov 16*4+0*8(%rsp),%rdi lea 16*4(%rsi),%rsi add 4*0(%rdi),%eax add 4*1(%rdi),%ebx add 4*2(%rdi),%ecx add 4*3(%rdi),%edx add 4*4(%rdi),%r8d add 4*5(%rdi),%r9d add 4*6(%rdi),%r10d add 4*7(%rdi),%r11d cmp 16*4+2*8(%rsp),%rsi mov %eax,4*0(%rdi) mov %ebx,4*1(%rdi) mov %ecx,4*2(%rdi) mov %edx,4*3(%rdi) mov %r8d,4*4(%rdi) mov %r9d,4*5(%rdi) mov %r10d,4*6(%rdi) mov %r11d,4*7(%rdi) jb .Lloop mov 16*4+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx RET .cfi_endproc SET_SIZE(SHA256TransformBlocks) .section .rodata -.align 64 +.balign 64 SET_OBJ(K256) K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 #endif /* !lint && !__lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index 180f8e366060..520f5b6dab24 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -1,2116 +1,2116 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. 
In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. 
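 *
 * (Illustrative aside, an inference spelled out for clarity rather than
 *  text from the original: since "algotype" is a 4-byte field and the
 *  following member is 8-byte aligned, the hash state begins at byte
 *  offset 8 of SHA2_CTX, which is why the code below executes
 *  "add $8,%rdi" before loading the eight state words from 8*0(%rdi)
 *  through 8*7(%rdi).)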
*/ #if defined(lint) || defined(__lint) #include #include void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { (void) ctx, (void) in, (void) num; } #else #define _ASM #include ENTRY_NP(SHA512TransformBlocks) .cfi_startproc ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*8+4*8,%rsp lea (%rsi,%rdx,8),%rdx # inp+num*16*8 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*8+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+152,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K512-.(%rbp),%rbp mov 8*0(%rdi),%rax mov 8*1(%rdi),%rbx mov 8*2(%rdi),%rcx mov 8*3(%rdi),%rdx mov 8*4(%rdi),%r8 mov 8*5(%rdi),%r9 mov 8*6(%rdi),%r10 mov 8*7(%rdi),%r11 jmp .Lloop -.align 16 +.balign 16 .Lloop: xor %rdi,%rdi mov 8*0(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*1(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*2(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*3(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*4(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*5(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*6(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*7(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 8*8(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*9(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*10(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*11(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*12(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*13(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*14(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*15(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) jmp .Lrounds_16_xx -.align 16 +.balign 16 .Lrounds_16_xx: mov 8(%rsp),%r13 mov 112(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 72(%rsp),%r12 add 0(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 16(%rsp),%r13 mov 120(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 80(%rsp),%r12 add 8(%rsp),%r12 mov %rdx,%r13 mov 
%rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 24(%rsp),%r13 mov 0(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 88(%rsp),%r12 add 16(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 32(%rsp),%r13 mov 8(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 96(%rsp),%r12 add 24(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 40(%rsp),%r13 mov 16(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 104(%rsp),%r12 add 32(%rsp),%r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # 
h+=Maj(a,b,c) mov 48(%rsp),%r13 mov 24(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 112(%rsp),%r12 add 40(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 56(%rsp),%r13 mov 32(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 120(%rsp),%r12 add 48(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 64(%rsp),%r13 mov 40(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 0(%rsp),%r12 add 56(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 72(%rsp),%r13 mov 48(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 8(%rsp),%r12 add 64(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov 
%rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 80(%rsp),%r13 mov 56(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 16(%rsp),%r12 add 72(%rsp),%r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 88(%rsp),%r13 mov 64(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 24(%rsp),%r12 add 80(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 96(%rsp),%r13 mov 72(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 32(%rsp),%r12 add 88(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 104(%rsp),%r13 mov 80(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 40(%rsp),%r12 add 96(%rsp),%r12 mov %rax,%r13 
mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 112(%rsp),%r13 mov 88(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 48(%rsp),%r12 add 104(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 120(%rsp),%r13 mov 96(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 56(%rsp),%r12 add 112(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 0(%rsp),%r13 mov 104(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 64(%rsp),%r12 add 120(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ 
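# Reference note (standard SHA-512 relations that the inline comments above abbreviate;
# this comment is explanatory only and is not part of the generated code):
#   T1 = W[round] + h + Sigma1(e) + Ch(e,f,g) + K[round];   d += T1;
#   h  = Sigma0(a) + T1 + Maj(a,b,c)
# with Sigma1(e) = ror(e,14)^ror(e,18)^ror(e,41) and Sigma0(a) = ror(a,28)^ror(a,34)^ror(a,39),
# which is why the code rotates by 14/18 then 23 more, and by 28/34 then 5 more.
# In .Lrounds_16_xx the message schedule kept at 0(%rsp)..120(%rsp) is extended in place as
#   W[i&15] += sigma0(W[(i+1)&15]) + sigma1(W[(i+14)&15]) + W[(i+9)&15]
# where sigma0(x) = ror(x,1)^ror(x,8)^shr(x,7) and sigma1(x) = ror(x,19)^ror(x,61)^shr(x,6).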
add %r14,%rax # h+=Maj(a,b,c) cmp $80,%rdi jb .Lrounds_16_xx mov 16*8+0*8(%rsp),%rdi lea 16*8(%rsi),%rsi add 8*0(%rdi),%rax add 8*1(%rdi),%rbx add 8*2(%rdi),%rcx add 8*3(%rdi),%rdx add 8*4(%rdi),%r8 add 8*5(%rdi),%r9 add 8*6(%rdi),%r10 add 8*7(%rdi),%r11 cmp 16*8+2*8(%rsp),%rsi mov %rax,8*0(%rdi) mov %rbx,8*1(%rdi) mov %rcx,8*2(%rdi) mov %rdx,8*3(%rdi) mov %r8,8*4(%rdi) mov %r9,8*5(%rdi) mov %r10,8*6(%rdi) mov %r11,8*7(%rdi) jb .Lloop mov 16*8+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx RET .cfi_endproc SET_SIZE(SHA512TransformBlocks) .section .rodata -.align 64 +.balign 64 SET_OBJ(K512) K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif /* !lint && !__lint */ #if defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/module/lua/setjmp/setjmp_aarch64.S b/module/lua/setjmp/setjmp_aarch64.S index a5a9a85fd57e..040ef1821ab0 100644 --- a/module/lua/setjmp/setjmp_aarch64.S +++ b/module/lua/setjmp/setjmp_aarch64.S @@ -1,86 +1,86 @@ /*- * Copyright (c) 2014 Andrew Turner * Copyright (c) 2014-2015 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Andrew Turner * under sponsorship from the FreeBSD Foundation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifdef __aarch64__ #define ENTRY(sym) \ .text; \ .globl sym; \ - .align 2; \ + .balign 2; \ .type sym,#function; \ sym: #define END(sym) \ .size sym, . - sym ENTRY(setjmp) /* Store the stack pointer */ mov x8, sp str x8, [x0], #8 /* Store the general purpose registers and lr */ stp x19, x20, [x0], #16 stp x21, x22, [x0], #16 stp x23, x24, [x0], #16 stp x25, x26, [x0], #16 stp x27, x28, [x0], #16 stp x29, x30, [x0], #16 /* Return value */ mov x0, #0 ret END(setjmp) ENTRY(longjmp) /* Restore the stack pointer */ ldr x8, [x0], #8 mov sp, x8 /* Restore the general purpose registers and lr */ ldp x19, x20, [x0], #16 ldp x21, x22, [x0], #16 ldp x23, x24, [x0], #16 ldp x25, x26, [x0], #16 ldp x27, x28, [x0], #16 ldp x29, x30, [x0], #16 /* Load the return value */ mov x0, x1 ret END(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif /* __aarch64__ */ diff --git a/module/lua/setjmp/setjmp_arm.S b/module/lua/setjmp/setjmp_arm.S index 78bc3e0b347d..0b18a96282cf 100644 --- a/module/lua/setjmp/setjmp_arm.S +++ b/module/lua/setjmp/setjmp_arm.S @@ -1,84 +1,84 @@ /*- * Copyright 2004-2014 Olivier Houchard * Copyright 2012-2014 Ian Lepore * Copyright 2013-2014 Andrew Turner * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if defined(__arm__) && !defined(__aarch64__) #if defined(__thumb2__) #define _FUNC_MODE .code 16; .thumb_func #else #define _FUNC_MODE .code 32 #endif #define ENTRY(x) \ .text; \ .syntax unified; \ - .align 2; \ + .balign 2; \ .global x; \ .type x,#function; \ _FUNC_MODE; \ x: #define END(x) \ .size x, . - x; #define RET bx lr /* * setjump + longjmp */ ENTRY(setjmp) #if defined(__thumb2__) mov ip, sp stmia r0, {r4-r12,r14} #else stmia r0, {r4-r14} #endif mov r0, #0x00000000 RET END(setjmp) ENTRY(longjmp) #if defined(__thumb2__) ldmia r0, {r4-r12,r14} mov sp, ip #else ldmia r0, {r4-r14} #endif mov r0, #0x00000001 RET END(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif diff --git a/module/lua/setjmp/setjmp_i386.S b/module/lua/setjmp/setjmp_i386.S index 0d0adfc351ca..87f9cb08c292 100644 --- a/module/lua/setjmp/setjmp_i386.S +++ b/module/lua/setjmp/setjmp_i386.S @@ -1,69 +1,69 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ #define ENTRY(x) \ .text; \ - .align 8; \ + .balign 8; \ .globl x; \ .type x, @function; \ x: #define SET_SIZE(x) \ .size x, [.-x] /* * Setjmp and longjmp implement non-local gotos using state vectors * type label_t. 
*/ #ifdef __i386__ ENTRY(setjmp) /* save area is passed in eax */ movl %ebp, 0(%eax) /* save ebp */ movl %ebx, 4(%eax) /* save ebx */ movl %esi, 8(%eax) /* save esi */ movl %edi, 12(%eax) /* save edi */ movl %esp, 16(%eax) /* save esp */ movl (%esp), %ecx /* %eip (return address) */ movl %ecx, 20(%eax) /* save eip */ subl %eax, %eax /* return 0 */ ret SET_SIZE(setjmp) ENTRY(longjmp) /* save area is passed in eax */ movl 0(%eax), %ebp /* restore ebp */ movl 4(%eax), %ebx /* restore ebx */ movl 8(%eax), %esi /* restore esi */ movl 12(%eax), %edi /* restore edi */ movl 16(%eax), %esp /* restore esp */ movl 20(%eax), %ecx /* %eip (return address) */ addl $4, %esp /* pop ret adr */ jmp *%ecx /* indirect jump */ SET_SIZE(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif /* __i386__ */ diff --git a/module/lua/setjmp/setjmp_ppc.S b/module/lua/setjmp/setjmp_ppc.S index 72aa5d5ab5b0..a035cd11b33b 100644 --- a/module/lua/setjmp/setjmp_ppc.S +++ b/module/lua/setjmp/setjmp_ppc.S @@ -1,165 +1,165 @@ /* $FreeBSD$ */ /* from: NetBSD: setjmp.S,v 1.1 1998/01/27 15:13:12 sakamoto Exp $ */ /* from: OpenBSD: setjmp.S,v 1.2 1996/12/28 06:22:18 rahnds Exp */ /* kernel version of this file, does not have signal goop */ /* int setjmp(jmp_buf env) */ #define _ASM #include #ifdef __powerpc64__ #if !defined(PPC64_ELF_ABI_v2) && !defined(PPC64_ELF_ABI_v1) #if defined(_CALL_ELF) && _CALL_ELF == 2 #define PPC64_ELF_ABI_v2 #endif /* _CALL_ELF */ #endif /* PPC64_ELF_ABI_ */ #endif /* __powerpc64__ */ #ifdef __powerpc64__ #define LD_REG ld #define ST_REG std #define REGWIDTH 8 #else #define LD_REG lwz #define ST_REG stw #define REGWIDTH 4 #endif /* __powerpc64__ */ #define JMP_r1 1*REGWIDTH #define JMP_r2 2*REGWIDTH #define JMP_r14 3*REGWIDTH #define JMP_r15 4*REGWIDTH #define JMP_r16 5*REGWIDTH #define JMP_r17 6*REGWIDTH #define JMP_r18 7*REGWIDTH #define JMP_r19 8*REGWIDTH #define JMP_r20 9*REGWIDTH #define JMP_r21 10*REGWIDTH #define JMP_r22 11*REGWIDTH #define JMP_r23 12*REGWIDTH #define JMP_r24 13*REGWIDTH #define JMP_r25 14*REGWIDTH #define JMP_r26 15*REGWIDTH #define JMP_r27 16*REGWIDTH #define JMP_r28 17*REGWIDTH #define JMP_r29 18*REGWIDTH #define JMP_r30 19*REGWIDTH #define JMP_r31 20*REGWIDTH #define JMP_lr 21*REGWIDTH #define JMP_cr 22*REGWIDTH #define JMP_ctr 23*REGWIDTH #define JMP_xer 24*REGWIDTH #ifdef __powerpc64__ #ifdef PPC64_ELF_ABI_v2 #define ENTRY(name) \ - .align 2 ; \ + .balign 2 ; \ .type name,@function; \ .weak name; \ name: #else /* PPC64_ELF_ABI_v1 */ #define XGLUE(a,b) a##b #define GLUE(a,b) XGLUE(a,b) #define ENTRY(name) \ - .align 2 ; \ + .balign 2 ; \ .weak name; \ .weak GLUE(.,name); \ .pushsection ".opd","aw"; \ name: \ .quad GLUE(.,name); \ .quad .TOC.@tocbase; \ .quad 0; \ .popsection; \ .type GLUE(.,name),@function; \ GLUE(.,name): #endif /* PPC64_ELF_ABI_v2 */ #else /* 32-bit */ #define ENTRY(name) \ .text; \ .p2align 4; \ .weak name; \ .type name,@function; \ name: #endif /* __powerpc64__ */ ENTRY(setjmp) ST_REG 31, JMP_r31(3) /* r1, r2, r14-r30 */ ST_REG 1, JMP_r1 (3) ST_REG 2, JMP_r2 (3) ST_REG 14, JMP_r14(3) ST_REG 15, JMP_r15(3) ST_REG 16, JMP_r16(3) ST_REG 17, JMP_r17(3) ST_REG 18, JMP_r18(3) ST_REG 19, JMP_r19(3) ST_REG 20, JMP_r20(3) ST_REG 21, JMP_r21(3) ST_REG 22, JMP_r22(3) ST_REG 23, JMP_r23(3) ST_REG 24, JMP_r24(3) ST_REG 25, JMP_r25(3) ST_REG 26, JMP_r26(3) ST_REG 27, JMP_r27(3) ST_REG 28, JMP_r28(3) ST_REG 29, JMP_r29(3) ST_REG 30, JMP_r30(3) /* cr, lr, ctr, xer */ mfcr 0 ST_REG 0, JMP_cr(3) mflr 0 ST_REG 0, JMP_lr(3) mfctr 0 ST_REG 0, 
JMP_ctr(3) mfxer 0 ST_REG 0, JMP_xer(3) /* f14-f31, fpscr */ li 3, 0 blr ENTRY(longjmp) LD_REG 31, JMP_r31(3) /* r1, r2, r14-r30 */ LD_REG 1, JMP_r1 (3) LD_REG 2, JMP_r2 (3) LD_REG 14, JMP_r14(3) LD_REG 15, JMP_r15(3) LD_REG 16, JMP_r16(3) LD_REG 17, JMP_r17(3) LD_REG 18, JMP_r18(3) LD_REG 19, JMP_r19(3) LD_REG 20, JMP_r20(3) LD_REG 21, JMP_r21(3) LD_REG 22, JMP_r22(3) LD_REG 23, JMP_r23(3) LD_REG 24, JMP_r24(3) LD_REG 25, JMP_r25(3) LD_REG 26, JMP_r26(3) LD_REG 27, JMP_r27(3) LD_REG 28, JMP_r28(3) LD_REG 29, JMP_r29(3) LD_REG 30, JMP_r30(3) /* cr, lr, ctr, xer */ LD_REG 0, JMP_cr(3) mtcr 0 LD_REG 0, JMP_lr(3) mtlr 0 LD_REG 0, JMP_ctr(3) mtctr 0 LD_REG 0, JMP_xer(3) mtxer 0 /* f14-f31, fpscr */ mr 3, 4 blr #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/lua/setjmp/setjmp_sparc64.S b/module/lua/setjmp/setjmp_sparc64.S index a37a71cbce33..e1099643de92 100644 --- a/module/lua/setjmp/setjmp_sparc64.S +++ b/module/lua/setjmp/setjmp_sparc64.S @@ -1,105 +1,105 @@ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Header: _setjmp.s,v 1.1 91/07/06 16:45:53 torek Exp */ #if defined(LIBC_SCCS) && !defined(lint) #if 0 .asciz "@(#)_setjmp.s 8.1 (Berkeley) 6/4/93" #else RCSID("$NetBSD: _setjmp.S,v 1.4 1998/10/08 02:27:59 eeh Exp $") #endif #endif /* LIBC_SCCS and not lint */ #define _JB_FP 0x0 #define _JB_PC 0x8 #define _JB_SP 0x10 .register %g2,#ignore .register %g3,#ignore #define ENTRY(x) \ .text ; \ - .align 32 ; \ + .balign 32 ; \ .globl x ; \ .type x,@function ; \ x: #define END(x) \ .size x, . - x /* * C library -- setjmp, longjmp * * longjmp(a,v) * will generate a "return(v?v:1)" from * the last call to * setjmp(a) * by restoring the previous context. 
*/ ENTRY(setjmp) stx %sp, [%o0 + _JB_SP] stx %o7, [%o0 + _JB_PC] stx %fp, [%o0 + _JB_FP] retl clr %o0 END(setjmp) ENTRY(longjmp) mov 1, %g1 movrnz %o1, %o1, %g1 mov %o0, %g2 ldx [%g2 + _JB_FP], %g3 1: cmp %fp, %g3 bl,a 1b restore be,a 2f ldx [%g2 + _JB_SP], %o0 .Lbotch: illtrap 2: cmp %o0, %sp bge,a 3f mov %o0, %sp b,a .Lbotch nop 3: ldx [%g2 + _JB_PC], %o7 retl mov %g1, %o0 END(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif
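The directive change running through all of these hunks swaps ".align" for ".balign". GNU as gives ".align N" a target-dependent meaning: on x86 ELF it requests an N-byte boundary, while on arm, aarch64 and several other back ends it requests a 2^N-byte boundary. ".balign N" always means an N-byte boundary and ".p2align N" always means 2^N bytes, so the replacement directive reads the same way across the i386, x86-64, arm, aarch64, powerpc and sparc64 files touched here. A minimal stand-alone sketch, not taken from the patch (the label name is illustrative only):

	.text
	.balign	16		/* always a 16-byte boundary, on every target */
	.p2align 4		/* equivalent request, expressed as 2^4 bytes */
	/* ".align 16" would be 16 bytes on x86 ELF, but 2^16 where .align is a power of two */
example_entry:			/* hypothetical label, for illustration only */
	ret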