| /* | 
 |  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES | 
 |  * | 
 |  * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | 
 |  * | 
 |  * This program is free software; you can redistribute it and/or modify | 
 |  * it under the terms of the GNU General Public License version 2 as | 
 |  * published by the Free Software Foundation. | 
 |  */ | 
 |  | 
 | /* included by aes-ce.S and aes-neon.S */ | 
 |  | 
 | 	.text | 
 | 	.align		4 | 
 |  | 
 | /* | 
 |  * There are several ways to instantiate this code: | 
 |  * - no interleave, all inline | 
 |  * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) | 
 |  * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) | 
 |  * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) | 
 |  * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) | 
 |  * | 
 |  * Macros imported by this code: | 
 * - enc_prepare	- set up NEON registers for encryption
 * - dec_prepare	- set up NEON registers for decryption
 |  * - enc_switch_key	- change to new key after having prepared for encryption | 
 |  * - encrypt_block	- encrypt a single block | 
 * - decrypt_block	- decrypt a single block
 |  * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2) | 
 |  * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2) | 
 |  * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4) | 
 |  * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4) | 
 |  */ | 
 |  | 
 | #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) | 
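/*
 * In the out-of-line case, the mode routines reach the 2x/4x block
 * helpers via bl, which clobbers the link register, so each routine
 * sets up a stack frame to preserve x29/x30 across those calls.
 */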
 | #define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp | 
 | #define FRAME_POP	ldp x29, x30, [sp],#16 | 
 |  | 
 | #if INTERLEAVE == 2 | 
 |  | 
 | aes_encrypt_block2x: | 
 | 	encrypt_block2x	v0, v1, w3, x2, x6, w7 | 
 | 	ret | 
 | ENDPROC(aes_encrypt_block2x) | 
 |  | 
 | aes_decrypt_block2x: | 
 | 	decrypt_block2x	v0, v1, w3, x2, x6, w7 | 
 | 	ret | 
 | ENDPROC(aes_decrypt_block2x) | 
 |  | 
 | #elif INTERLEAVE == 4 | 
 |  | 
 | aes_encrypt_block4x: | 
 | 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 | 
 | 	ret | 
 | ENDPROC(aes_encrypt_block4x) | 
 |  | 
 | aes_decrypt_block4x: | 
 | 	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 | 
 | 	ret | 
 | ENDPROC(aes_decrypt_block4x) | 
 |  | 
 | #else | 
 | #error INTERLEAVE should equal 2 or 4 | 
 | #endif | 
 |  | 
 | 	.macro		do_encrypt_block2x | 
 | 	bl		aes_encrypt_block2x | 
 | 	.endm | 
 |  | 
 | 	.macro		do_decrypt_block2x | 
 | 	bl		aes_decrypt_block2x | 
 | 	.endm | 
 |  | 
 | 	.macro		do_encrypt_block4x | 
 | 	bl		aes_encrypt_block4x | 
 | 	.endm | 
 |  | 
 | 	.macro		do_decrypt_block4x | 
 | 	bl		aes_decrypt_block4x | 
 | 	.endm | 
 |  | 
 | #else | 
 | #define FRAME_PUSH | 
 | #define FRAME_POP | 
 |  | 
 | 	.macro		do_encrypt_block2x | 
 | 	encrypt_block2x	v0, v1, w3, x2, x6, w7 | 
 | 	.endm | 
 |  | 
 | 	.macro		do_decrypt_block2x | 
 | 	decrypt_block2x	v0, v1, w3, x2, x6, w7 | 
 | 	.endm | 
 |  | 
 | 	.macro		do_encrypt_block4x | 
 | 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 | 
 | 	.endm | 
 |  | 
 | 	.macro		do_decrypt_block4x | 
 | 	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7 | 
 | 	.endm | 
 |  | 
 | #endif | 
 |  | 
 | 	/* | 
 | 	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | 
 | 	 *		   int blocks, int first) | 
 | 	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | 
 | 	 *		   int blocks, int first) | 
 | 	 */ | 
 |  | 
 | AES_ENTRY(aes_ecb_encrypt) | 
 | 	FRAME_PUSH | 
 | 	cbz		w5, .LecbencloopNx | 
 |  | 
 | 	enc_prepare	w3, x2, x5 | 
 |  | 
 | .LecbencloopNx: | 
 | #if INTERLEAVE >= 2 | 
 | 	subs		w4, w4, #INTERLEAVE | 
 | 	bmi		.Lecbenc1x | 
 | #if INTERLEAVE == 2 | 
 | 	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */ | 
 | 	do_encrypt_block2x | 
 | 	st1		{v0.16b-v1.16b}, [x0], #32 | 
 | #else | 
 | 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */ | 
 | 	do_encrypt_block4x | 
 | 	st1		{v0.16b-v3.16b}, [x0], #64 | 
 | #endif | 
 | 	b		.LecbencloopNx | 
 | .Lecbenc1x: | 
 | 	adds		w4, w4, #INTERLEAVE | 
 | 	beq		.Lecbencout | 
 | #endif | 
 | .Lecbencloop: | 
 | 	ld1		{v0.16b}, [x1], #16		/* get next pt block */ | 
 | 	encrypt_block	v0, w3, x2, x5, w6 | 
 | 	st1		{v0.16b}, [x0], #16 | 
 | 	subs		w4, w4, #1 | 
 | 	bne		.Lecbencloop | 
 | .Lecbencout: | 
 | 	FRAME_POP | 
 | 	ret | 
 | AES_ENDPROC(aes_ecb_encrypt) | 
 |  | 
 |  | 
 | AES_ENTRY(aes_ecb_decrypt) | 
 | 	FRAME_PUSH | 
 | 	cbz		w5, .LecbdecloopNx | 
 |  | 
 | 	dec_prepare	w3, x2, x5 | 
 |  | 
 | .LecbdecloopNx: | 
 | #if INTERLEAVE >= 2 | 
 | 	subs		w4, w4, #INTERLEAVE | 
 | 	bmi		.Lecbdec1x | 
 | #if INTERLEAVE == 2 | 
 | 	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */ | 
 | 	do_decrypt_block2x | 
 | 	st1		{v0.16b-v1.16b}, [x0], #32 | 
 | #else | 
 | 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */ | 
 | 	do_decrypt_block4x | 
 | 	st1		{v0.16b-v3.16b}, [x0], #64 | 
 | #endif | 
 | 	b		.LecbdecloopNx | 
 | .Lecbdec1x: | 
 | 	adds		w4, w4, #INTERLEAVE | 
 | 	beq		.Lecbdecout | 
 | #endif | 
 | .Lecbdecloop: | 
 | 	ld1		{v0.16b}, [x1], #16		/* get next ct block */ | 
 | 	decrypt_block	v0, w3, x2, x5, w6 | 
 | 	st1		{v0.16b}, [x0], #16 | 
 | 	subs		w4, w4, #1 | 
 | 	bne		.Lecbdecloop | 
 | .Lecbdecout: | 
 | 	FRAME_POP | 
 | 	ret | 
 | AES_ENDPROC(aes_ecb_decrypt) | 
 |  | 
 |  | 
 | 	/* | 
 | 	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | 
 | 	 *		   int blocks, u8 iv[], int first) | 
 | 	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | 
 | 	 *		   int blocks, u8 iv[], int first) | 
 | 	 */ | 
 |  | 
 | AES_ENTRY(aes_cbc_encrypt) | 
 | 	cbz		w6, .Lcbcencloop | 
 |  | 
 | 	ld1		{v0.16b}, [x5]			/* get iv */ | 
 | 	enc_prepare	w3, x2, x5 | 
 |  | 
 | .Lcbcencloop: | 
 | 	ld1		{v1.16b}, [x1], #16		/* get next pt block */ | 
	eor		v0.16b, v0.16b, v1.16b		/* ...and xor with iv */
 | 	encrypt_block	v0, w3, x2, x5, w6 | 
 | 	st1		{v0.16b}, [x0], #16 | 
 | 	subs		w4, w4, #1 | 
 | 	bne		.Lcbcencloop | 
 | 	ret | 
 | AES_ENDPROC(aes_cbc_encrypt) | 
 |  | 
 |  | 
 | AES_ENTRY(aes_cbc_decrypt) | 
 | 	FRAME_PUSH | 
 | 	cbz		w6, .LcbcdecloopNx | 
 |  | 
 | 	ld1		{v7.16b}, [x5]			/* get iv */ | 
 | 	dec_prepare	w3, x2, x5 | 
 |  | 
 | .LcbcdecloopNx: | 
 | #if INTERLEAVE >= 2 | 
 | 	subs		w4, w4, #INTERLEAVE | 
 | 	bmi		.Lcbcdec1x | 
 | #if INTERLEAVE == 2 | 
 | 	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */ | 
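	/* keep ct copies: v2 chains the second block, v3 is the next IV */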
 | 	mov		v2.16b, v0.16b | 
 | 	mov		v3.16b, v1.16b | 
 | 	do_decrypt_block2x | 
 | 	eor		v0.16b, v0.16b, v7.16b | 
 | 	eor		v1.16b, v1.16b, v2.16b | 
 | 	mov		v7.16b, v3.16b | 
 | 	st1		{v0.16b-v1.16b}, [x0], #32 | 
 | #else | 
 | 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */ | 
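	/* keep copies of the first 3 ct blocks in v4-v6 for chaining;
	 * the 4th ct block (the next IV) is reloaded from memory below,
	 * as v0-v3 are overwritten by the decryption */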
 | 	mov		v4.16b, v0.16b | 
 | 	mov		v5.16b, v1.16b | 
 | 	mov		v6.16b, v2.16b | 
 | 	do_decrypt_block4x | 
 | 	sub		x1, x1, #16 | 
 | 	eor		v0.16b, v0.16b, v7.16b | 
 | 	eor		v1.16b, v1.16b, v4.16b | 
 | 	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */ | 
 | 	eor		v2.16b, v2.16b, v5.16b | 
 | 	eor		v3.16b, v3.16b, v6.16b | 
 | 	st1		{v0.16b-v3.16b}, [x0], #64 | 
 | #endif | 
 | 	b		.LcbcdecloopNx | 
 | .Lcbcdec1x: | 
 | 	adds		w4, w4, #INTERLEAVE | 
 | 	beq		.Lcbcdecout | 
 | #endif | 
 | .Lcbcdecloop: | 
 | 	ld1		{v1.16b}, [x1], #16		/* get next ct block */ | 
 | 	mov		v0.16b, v1.16b			/* ...and copy to v0 */ | 
 | 	decrypt_block	v0, w3, x2, x5, w6 | 
 | 	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */ | 
 | 	mov		v7.16b, v1.16b			/* ct is next iv */ | 
 | 	st1		{v0.16b}, [x0], #16 | 
 | 	subs		w4, w4, #1 | 
 | 	bne		.Lcbcdecloop | 
 | .Lcbcdecout: | 
 | 	FRAME_POP | 
 | 	ret | 
 | AES_ENDPROC(aes_cbc_decrypt) | 
 |  | 
 |  | 
 | 	/* | 
 | 	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | 
 | 	 *		   int blocks, u8 ctr[], int first) | 
 | 	 */ | 
 |  | 
 | AES_ENTRY(aes_ctr_encrypt) | 
 | 	FRAME_PUSH | 
 | 	cbnz		w6, .Lctrfirst		/* 1st time around? */ | 
 | 	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */ | 
 | 	rev		x5, x5 | 
 | #if INTERLEAVE >= 2 | 
 | 	cmn		w5, w4			/* 32 bit overflow? */ | 
 | 	bcs		.Lctrinc | 
 | 	add		x5, x5, #1		/* increment BE ctr */ | 
 | 	b		.LctrincNx | 
 | #else | 
 | 	b		.Lctrinc | 
 | #endif | 
 | .Lctrfirst: | 
 | 	enc_prepare	w3, x2, x6 | 
 | 	ld1		{v4.16b}, [x5] | 
 | 	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */ | 
 | 	rev		x5, x5 | 
 | #if INTERLEAVE >= 2 | 
 | 	cmn		w5, w4			/* 32 bit overflow? */ | 
 | 	bcs		.Lctrloop | 
 | .LctrloopNx: | 
 | 	subs		w4, w4, #INTERLEAVE | 
 | 	bmi		.Lctr1x | 
 | #if INTERLEAVE == 2 | 
 | 	mov		v0.8b, v4.8b | 
 | 	mov		v1.8b, v4.8b | 
 | 	rev		x7, x5 | 
 | 	add		x5, x5, #1 | 
 | 	ins		v0.d[1], x7 | 
 | 	rev		x7, x5 | 
 | 	add		x5, x5, #1 | 
 | 	ins		v1.d[1], x7 | 
 | 	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */ | 
 | 	do_encrypt_block2x | 
 | 	eor		v0.16b, v0.16b, v2.16b | 
 | 	eor		v1.16b, v1.16b, v3.16b | 
 | 	st1		{v0.16b-v1.16b}, [x0], #32 | 
 | #else | 
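	/* build the next 3 counter blocks in one go: broadcast the low
	 * 32 bits of the byte-swapped counter, add 1-3, convert back to
	 * big endian and insert the results into the last lane of v1-v3
	 * (this path is only taken when the low 32 bits cannot wrap,
	 * as checked by the cmn above) */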
 | 	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */ | 
 | 	dup		v7.4s, w5 | 
 | 	mov		v0.16b, v4.16b | 
 | 	add		v7.4s, v7.4s, v8.4s | 
 | 	mov		v1.16b, v4.16b | 
 | 	rev32		v8.16b, v7.16b | 
 | 	mov		v2.16b, v4.16b | 
 | 	mov		v3.16b, v4.16b | 
 | 	mov		v1.s[3], v8.s[0] | 
 | 	mov		v2.s[3], v8.s[1] | 
 | 	mov		v3.s[3], v8.s[2] | 
 | 	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */ | 
 | 	do_encrypt_block4x | 
 | 	eor		v0.16b, v5.16b, v0.16b | 
 | 	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */ | 
 | 	eor		v1.16b, v6.16b, v1.16b | 
 | 	eor		v2.16b, v7.16b, v2.16b | 
 | 	eor		v3.16b, v5.16b, v3.16b | 
 | 	st1		{v0.16b-v3.16b}, [x0], #64 | 
 | 	add		x5, x5, #INTERLEAVE | 
 | #endif | 
 | 	cbz		w4, .LctroutNx | 
 | .LctrincNx: | 
 | 	rev		x7, x5 | 
 | 	ins		v4.d[1], x7 | 
 | 	b		.LctrloopNx | 
 | .LctroutNx: | 
 | 	sub		x5, x5, #1 | 
 | 	rev		x7, x5 | 
 | 	ins		v4.d[1], x7 | 
 | 	b		.Lctrout | 
 | .Lctr1x: | 
 | 	adds		w4, w4, #INTERLEAVE | 
 | 	beq		.Lctrout | 
 | #endif | 
 | .Lctrloop: | 
 | 	mov		v0.16b, v4.16b | 
 | 	encrypt_block	v0, w3, x2, x6, w7 | 
 | 	subs		w4, w4, #1 | 
 | 	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */ | 
 | 	ld1		{v3.16b}, [x1], #16 | 
 | 	eor		v3.16b, v0.16b, v3.16b | 
 | 	st1		{v3.16b}, [x0], #16 | 
 | 	beq		.Lctrout | 
 | .Lctrinc: | 
 | 	adds		x5, x5, #1		/* increment BE ctr */ | 
 | 	rev		x7, x5 | 
 | 	ins		v4.d[1], x7 | 
 | 	bcc		.Lctrloop		/* no overflow? */ | 
 | 	umov		x7, v4.d[0]		/* load upper word of ctr  */ | 
 | 	rev		x7, x7			/* ... to handle the carry */ | 
 | 	add		x7, x7, #1 | 
 | 	rev		x7, x7 | 
 | 	ins		v4.d[0], x7 | 
 | 	b		.Lctrloop | 
 | .Lctrhalfblock: | 
 | 	ld1		{v3.8b}, [x1] | 
 | 	eor		v3.8b, v0.8b, v3.8b | 
 | 	st1		{v3.8b}, [x0] | 
 | .Lctrout: | 
 | 	FRAME_POP | 
 | 	ret | 
 | AES_ENDPROC(aes_ctr_encrypt) | 
 | 	.ltorg | 
 |  | 
 |  | 
 | 	/* | 
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 | 	 *		   int blocks, u8 const rk2[], u8 iv[], int first) | 
 | 	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | 
 | 	 *		   int blocks, u8 const rk2[], u8 iv[], int first) | 
 | 	 */ | 
 |  | 
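	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128),
	 * reduced modulo x^128 + x^7 + x^2 + x + 1.  Both 64-bit halves
	 * are shifted left by one bit (by adding the tweak to itself);
	 * sshr/and turn the MSB of each half into a carry word using
	 * the constant {1, 0x87} (.Lxts_mul_x), and ext swaps the two
	 * carry words so the low half's carry feeds the high half while
	 * bit 127 folds back into the low byte as 0x87.
	 */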
 | 	.macro		next_tweak, out, in, const, tmp | 
 | 	sshr		\tmp\().2d,  \in\().2d,   #63 | 
 | 	and		\tmp\().16b, \tmp\().16b, \const\().16b | 
 | 	add		\out\().2d,  \in\().2d,   \in\().2d | 
 | 	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8 | 
 | 	eor		\out\().16b, \out\().16b, \tmp\().16b | 
 | 	.endm | 
 |  | 
 | .Lxts_mul_x: | 
 | 	.word		1, 0, 0x87, 0 | 
 |  | 
 | AES_ENTRY(aes_xts_encrypt) | 
 | 	FRAME_PUSH | 
 | 	cbz		w7, .LxtsencloopNx | 
 |  | 
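	/* first tweak = IV encrypted with the second key (rk2 in x5);
	 * enc_switch_key then moves to rk1 for the data blocks */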
 | 	ld1		{v4.16b}, [x6] | 
 | 	enc_prepare	w3, x5, x6 | 
 | 	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */ | 
 | 	enc_switch_key	w3, x2, x6 | 
 | 	ldr		q7, .Lxts_mul_x | 
 | 	b		.LxtsencNx | 
 |  | 
 | .LxtsencloopNx: | 
 | 	ldr		q7, .Lxts_mul_x | 
 | 	next_tweak	v4, v4, v7, v8 | 
 | .LxtsencNx: | 
 | #if INTERLEAVE >= 2 | 
 | 	subs		w4, w4, #INTERLEAVE | 
 | 	bmi		.Lxtsenc1x | 
 | #if INTERLEAVE == 2 | 
 | 	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */ | 
 | 	next_tweak	v5, v4, v7, v8 | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	do_encrypt_block2x | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	st1		{v0.16b-v1.16b}, [x0], #32 | 
 | 	cbz		w4, .LxtsencoutNx | 
 | 	next_tweak	v4, v5, v7, v8 | 
 | 	b		.LxtsencNx | 
 | .LxtsencoutNx: | 
 | 	mov		v4.16b, v5.16b | 
 | 	b		.Lxtsencout | 
 | #else | 
 | 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */ | 
 | 	next_tweak	v5, v4, v7, v8 | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	next_tweak	v6, v5, v7, v8 | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	eor		v2.16b, v2.16b, v6.16b | 
 | 	next_tweak	v7, v6, v7, v8 | 
 | 	eor		v3.16b, v3.16b, v7.16b | 
 | 	do_encrypt_block4x | 
 | 	eor		v3.16b, v3.16b, v7.16b | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	eor		v2.16b, v2.16b, v6.16b | 
 | 	st1		{v0.16b-v3.16b}, [x0], #64 | 
 | 	mov		v4.16b, v7.16b | 
 | 	cbz		w4, .Lxtsencout | 
 | 	b		.LxtsencloopNx | 
 | #endif | 
 | .Lxtsenc1x: | 
 | 	adds		w4, w4, #INTERLEAVE | 
 | 	beq		.Lxtsencout | 
 | #endif | 
 | .Lxtsencloop: | 
 | 	ld1		{v1.16b}, [x1], #16 | 
 | 	eor		v0.16b, v1.16b, v4.16b | 
 | 	encrypt_block	v0, w3, x2, x6, w7 | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	st1		{v0.16b}, [x0], #16 | 
 | 	subs		w4, w4, #1 | 
 | 	beq		.Lxtsencout | 
 | 	next_tweak	v4, v4, v7, v8 | 
 | 	b		.Lxtsencloop | 
 | .Lxtsencout: | 
 | 	FRAME_POP | 
 | 	ret | 
 | AES_ENDPROC(aes_xts_encrypt) | 
 |  | 
 |  | 
 | AES_ENTRY(aes_xts_decrypt) | 
 | 	FRAME_PUSH | 
 | 	cbz		w7, .LxtsdecloopNx | 
 |  | 
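	/* the tweak is always generated with the encryption rounds of
	 * rk2, even when decrypting; only the data blocks use the
	 * decryption keys prepared from rk1 */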
 | 	ld1		{v4.16b}, [x6] | 
 | 	enc_prepare	w3, x5, x6 | 
 | 	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */ | 
 | 	dec_prepare	w3, x2, x6 | 
 | 	ldr		q7, .Lxts_mul_x | 
 | 	b		.LxtsdecNx | 
 |  | 
 | .LxtsdecloopNx: | 
 | 	ldr		q7, .Lxts_mul_x | 
 | 	next_tweak	v4, v4, v7, v8 | 
 | .LxtsdecNx: | 
 | #if INTERLEAVE >= 2 | 
 | 	subs		w4, w4, #INTERLEAVE | 
 | 	bmi		.Lxtsdec1x | 
 | #if INTERLEAVE == 2 | 
 | 	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */ | 
 | 	next_tweak	v5, v4, v7, v8 | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	do_decrypt_block2x | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	st1		{v0.16b-v1.16b}, [x0], #32 | 
 | 	cbz		w4, .LxtsdecoutNx | 
 | 	next_tweak	v4, v5, v7, v8 | 
 | 	b		.LxtsdecNx | 
 | .LxtsdecoutNx: | 
 | 	mov		v4.16b, v5.16b | 
 | 	b		.Lxtsdecout | 
 | #else | 
 | 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */ | 
 | 	next_tweak	v5, v4, v7, v8 | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	next_tweak	v6, v5, v7, v8 | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	eor		v2.16b, v2.16b, v6.16b | 
 | 	next_tweak	v7, v6, v7, v8 | 
 | 	eor		v3.16b, v3.16b, v7.16b | 
 | 	do_decrypt_block4x | 
 | 	eor		v3.16b, v3.16b, v7.16b | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	eor		v1.16b, v1.16b, v5.16b | 
 | 	eor		v2.16b, v2.16b, v6.16b | 
 | 	st1		{v0.16b-v3.16b}, [x0], #64 | 
 | 	mov		v4.16b, v7.16b | 
 | 	cbz		w4, .Lxtsdecout | 
 | 	b		.LxtsdecloopNx | 
 | #endif | 
 | .Lxtsdec1x: | 
 | 	adds		w4, w4, #INTERLEAVE | 
 | 	beq		.Lxtsdecout | 
 | #endif | 
 | .Lxtsdecloop: | 
 | 	ld1		{v1.16b}, [x1], #16 | 
 | 	eor		v0.16b, v1.16b, v4.16b | 
 | 	decrypt_block	v0, w3, x2, x6, w7 | 
 | 	eor		v0.16b, v0.16b, v4.16b | 
 | 	st1		{v0.16b}, [x0], #16 | 
 | 	subs		w4, w4, #1 | 
 | 	beq		.Lxtsdecout | 
 | 	next_tweak	v4, v4, v7, v8 | 
 | 	b		.Lxtsdecloop | 
 | .Lxtsdecout: | 
 | 	FRAME_POP | 
 | 	ret | 
 | AES_ENDPROC(aes_xts_decrypt) |