#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.arch	armv8-a+crypto
.text
.globl	unroll8_eor3_aes_gcm_enc_128_kernel
.type	unroll8_eor3_aes_gcm_enc_128_kernel,%function
.align	4
// unroll8_eor3_aes_gcm_enc_128_kernel
//
// AES-128-GCM encryption kernel that handles eight 16-byte blocks per pass
// and uses EOR3 (emitted as raw .inst words) for the three-way XORs.
// This first section sets up the stack frame and counters, runs AES rounds
// 0-9 on counter blocks 0-7 and, when more than 128 bytes are to be
// processed, encrypts and stores the first eight blocks.
//
// Register roles as used in the code below:
//   x0      input pointer, post-incremented by the plaintext loads
//   x1      length; x1 >> 3 is used as the byte length, so x1 looks like a
//           bit count (NOTE(review): confirm against the C prototype)
//   x2      output pointer, post-incremented by the ciphertext stores
//   x3      table base: [x3] is loaded into v19 here, and the later sections
//           load hash key powers from x3 + 32 .. x3 + 208
//   x4      counter block pointer on entry (copied to x16), then reused as
//           end_input_ptr
//   x5      round key pointer on entry (copied to x8), then reused as
//           main_end_input_ptr
//   x10     address of the GHASH reduction constant stored on the stack
//   v0-v7   counter blocks, turned into keystream by the AES rounds
//   v8-v15  plaintext in / ciphertext out (d8-d15 are saved on entry)
//   v19     running GHASH value
//   v26-v28 round keys currently in flight (v27 ends up holding rk10)
//   v30     byte-reversed running counter
//   v31     counter increment
//
// Stack frame (80 bytes): sp+0..63 saved d8-d15, sp+64 reduction constant,
// sp+72 zero.
unroll8_eor3_aes_gcm_enc_128_kernel:
	AARCH64_VALID_CALL_TARGET			//macro from arm_arch.h (presumably a BTI landing pad - confirm there)
	cbz	x1, .L128_enc_ret			//zero length: leave before the stack is touched
	stp	d8, d9, [sp, #-80]!			//allocate the 80 byte frame and save d8, d9 at sp+0
	lsr	x9, x1, #3				//x9 = x1 / 8, used as byte_len below
	mov	x16, x4				//x16 = counter block pointer (x4 is reused later)
	mov	x8, x5				//x8 = round key pointer (x5 is reused later)
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	mov	x5, #0xc200000000000000		//GHASH reduction constant, read back through x10 as the MODULO constant
	stp	x5, xzr, [sp, #64]			//constant at sp+64, zero at sp+72
	add	x10, sp, #64				//x10 = address of the constant

	mov	x15, #0x100000000				//set up counter increment
	movi	v31.16b, #0x0
	mov	v31.d[1], x15				//v31 as 32-bit lanes = {0, 0, 0, 1}: only the last lane is incremented
	mov	x5, x9
	ld1	{ v0.16b}, [x16]					//CTR block 0

	sub	x5, x5, #1	 	//byte_len - 1

	and	x5, x5, #0xffffffffffffff80		//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	rev32	v30.16b, v0.16b				//set up reversed counter

	add	v30.4s, v30.4s, v31.4s		//CTR block 0

	rev32	v1.16b, v30.16b				//CTR block 1
	add	v30.4s, v30.4s, v31.4s		//CTR block 1

	rev32	v2.16b, v30.16b				//CTR block 2
	add	v30.4s, v30.4s, v31.4s		//CTR block 2

	rev32	v3.16b, v30.16b				//CTR block 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 3

	rev32	v4.16b, v30.16b				//CTR block 4
	add	v30.4s, v30.4s, v31.4s		//CTR block 4

	rev32	v5.16b, v30.16b				//CTR block 5
	add	v30.4s, v30.4s, v31.4s		//CTR block 5
	ldp	q26, q27, [x8, #0]				  	//load rk0, rk1

	rev32	v6.16b, v30.16b				//CTR block 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 6

	rev32	v7.16b, v30.16b				//CTR block 7
	add	v30.4s, v30.4s, v31.4s		//CTR block 7

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 0

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 0

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 1

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 1

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 1

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 2

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 2

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 2

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 3

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 3

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 3

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 3

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 4

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 4

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 4

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 5

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 5

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 6

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 6

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 7

	ld1	{ v19.16b}, [x3]				//load the running GHASH value from [x3]
	ext	v19.16b, v19.16b, v19.16b, #8		//swap the two 64-bit halves
	rev64	v19.16b, v19.16b				//byte-reverse each 64-bit half

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 7

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 7

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 7

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 8

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
	ldr	q27, [x8, #160]					//load rk10

	aese	v3.16b, v26.16b						//AES block 3 - round 9 (no aesmc after the last aese)
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
	aese	v2.16b, v26.16b						//AES block 2 - round 9

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
	aese	v6.16b, v26.16b						//AES block 6 - round 9

	aese	v4.16b, v26.16b						//AES block 4 - round 9
	add	x5, x5, x0				//x5 = main_end_input_ptr (input pointer + main loop byte count)
	aese	v0.16b, v26.16b						//AES block 0 - round 9

	aese	v7.16b, v26.16b						//AES block 7 - round 9
	aese	v5.16b, v26.16b						//AES block 5 - round 9
	aese	v1.16b, v26.16b						//AES block 1 - round 9

	add	x4, x0, x1, lsr #3		//end_input_ptr
	cmp	x0, x5				//check if we have <= 8 blocks
	b.ge	.L128_enc_tail						//handle tail

	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext

	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext

	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext

	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
	cmp	x0, x5				//main loop bytes left? flags are consumed by the b.ge at the end of this section (nothing in between writes NZCV)

.inst	0xce006d08	//eor3 v8.16b, v8.16b, v0.16b, v27.16b				//AES block 0 - result
	rev32	v0.16b, v30.16b				//CTR block 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8

.inst	0xce016d29	//eor3 v9.16b, v9.16b, v1.16b, v27.16b				//AES block 1 - result
	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result

	rev32	v1.16b, v30.16b				//CTR block 9
.inst	0xce056dad	//eor3 v13.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
	add	v30.4s, v30.4s, v31.4s		//CTR block 9

.inst	0xce026d4a	//eor3 v10.16b, v10.16b, v2.16b, v27.16b				//AES block 2 - result
.inst	0xce066dce	//eor3 v14.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
.inst	0xce046d8c	//eor3 v12.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result

	rev32	v2.16b, v30.16b				//CTR block 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 10

.inst	0xce036d6b	//eor3 v11.16b, v11.16b, v3.16b, v27.16b				//AES block 3 - result
.inst	0xce076def	//eor3 v15.16b, v15.16b, v7.16b,v27.16b				//AES block 7 - result
	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result

	rev32	v3.16b, v30.16b				//CTR block 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 11
	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result

	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result

	rev32	v4.16b, v30.16b				//CTR block 12
	add	v30.4s, v30.4s, v31.4s		//CTR block 12
	b.ge	.L128_enc_prepretail					//do prepretail

// Main loop. Each pass interleaves three jobs:
//   1. GHASH of the eight ciphertext blocks left in v8-v15 by the previous
//      pass (or by the code ahead of the loop), folded into v19.
//   2. AES rounds 0-9 on counter blocks v0-v7. v0-v4 were prepared before
//      this pass; v5-v7 are generated here.
//   3. Load of the next 128 bytes of plaintext into v8-v15 once GHASH has
//      finished with each register, final EOR3 with rk10 (v27), and store
//      of the resulting ciphertext.
// v20, v22, v23 and v25 first hold hash key powers h5-h8, then h1-h4, and
// near the end of the pass are reused to stage counter blocks 8k+16..8k+19
// until v0-v3 are free.
// The pass repeats while x0 is below x5 (main_end_input_ptr).
.L128_enc_main_loop:	//main	loop start
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	rev64	v9.16b, v9.16b						//GHASH block 8k+1
	rev64	v8.16b, v8.16b						//GHASH block 8k
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0

	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k
	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
	rev64	v11.16b, v11.16b						//GHASH block 8k+3

	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	rev32	v7.16b, v30.16b				//CTR block 8k+15

	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)

	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low

	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high

	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1

.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b,v9.16b			//GHASH block 8k+2, 8k+3 - high
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1

	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid

	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low

	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4

	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5

	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5

	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5

.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	ldr	d16, [x10]			//MODULO - load modulo constant
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6

	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6

	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	rev32	v20.16b, v30.16b					//CTR block 8k+16 (staged in v20 until v0 is free)
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6

.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
	ldp	q28, q26, [x8, #128]				//load rk8, rk9
.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7

	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7

	rev32	v22.16b, v30.16b					//CTR block 8k+17 (staged in v22 until v1 is free)
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	ldr	q27, [x8, #160]					//load rk10

	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
	rev32	v23.16b, v30.16b					//CTR block 8k+18 (staged in v23 until v2 is free)
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8

	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9

	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
	rev32	v25.16b, v30.16b					//CTR block 8k+19 (staged in v25 until v3 is free)
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19

	cmp	x0, x5				//.LOOP CONTROL (flags are consumed by the b.lt at the bottom; nothing in between writes NZCV)
.inst	0xce046d8c	//eor3 v12.16b, v12.16b, v4.16b, v27.16b				//AES block 8k+12 - result
	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9

	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9

.inst	0xce026d4a	//eor3 v10.16b, v10.16b, v2.16b, v27.16b				//AES block 8k+10 - result

	mov	v2.16b, v23.16b					//CTR block 8k+18
	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9

	rev32	v4.16b, v30.16b				//CTR block 8k+20
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20

.inst	0xce076def	//eor3 v15.16b, v15.16b, v7.16b, v27.16b				//AES block 8k+15 - result
	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low

.inst	0xce016d29	//eor3 v9.16b, v9.16b, v1.16b, v27.16b				//AES block 8k+9 - result
.inst	0xce036d6b	//eor3 v11.16b, v11.16b, v3.16b, v27.16b				//AES block 8k+11 - result
	mov	v3.16b, v25.16b					//CTR block 8k+19

	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
.inst	0xce056dad	//eor3 v13.16b, v13.16b, v5.16b, v27.16b				//AES block 8k+13 - result
	mov	v1.16b, v22.16b					//CTR block 8k+17

.inst	0xce006d08	//eor3 v8.16b, v8.16b, v0.16b, v27.16b				//AES block 8k+8 - result
	mov	v0.16b, v20.16b					//CTR block 8k+16
	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result

	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
.inst	0xce066dce	//eor3 v14.16b, v14.16b, v6.16b, v27.16b				//AES block 8k+14 - result

	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result
.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low

	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result
	b.lt	.L128_enc_main_loop

// Prepretail. Runs the same GHASH and AES round work as one main loop pass
// but neither loads plaintext nor stores ciphertext:
//   - the eight ciphertext blocks still held in v8-v15 are folded into the
//     GHASH value in v19, including the modular reduction;
//   - counter blocks v0-v7 (v5-v7 are generated here) go through AES
//     rounds 0-9.
// On leaving this section v0-v7 hold the state after round 9, v27 holds
// rk10 (not yet applied), and control falls through to the tail.
.L128_enc_prepretail:	//PREPRETAIL
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0

	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	rev64	v8.16b, v8.16b						//GHASH block 8k
	rev64	v9.16b, v9.16b						//GHASH block 8k+1

	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
	rev64	v11.16b, v11.16b						//GHASH block 8k+3

	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1

	rev32	v6.16b, v30.16b				//CTR block 8k+14

	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid

	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid

	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)

	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid

	rev32	v7.16b, v30.16b				//CTR block 8k+15

	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0

	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0

.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0

	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1

.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	ldp	q27, q28, [x8, #64]				//load rk4, rk5

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3

	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3

	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high

	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high

.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4

	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	ldr	d16, [x10]			//MODULO - load modulo constant

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
.inst	0xce114a73	//eor3 v19.16b, v19.16b, v17.16b, v18.16b		 	//MODULO - fold into low
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8

	ldr	q27, [x8, #160]					//load rk10
	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9

	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9

	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9

	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
.L128_enc_tail:	//TAIL
	// Tail entry: encrypts the first remaining block and dispatches on the
	// number of bytes left to one of the .L128_enc_blocks_* entry points.
	// Registers as used here:
	//   x0 = input pointer, x3 = base for the hash-key loads below,
	//   x4 = presumably the end-of-input pointer (set outside this chunk,
	//        TODO confirm), v0..v7 = keystream blocks that still need the
	//   last round key, v19 = GHASH accumulator, v30 = counter, v31 = counter
	//   increment.
	// Each fall-through step below means one fewer block remains: the
	// keystream registers are moved up by one (so the keystream for the
	// second tail block, initially in v1, ends up in the register read by
	// the chosen entry point) and v30 is stepped back by one increment for
	// the keystream block that will not be consumed.

	sub	x5, x4, x0 	//main_end_input_ptr is number of bytes left to process
	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext

	mov	v29.16b, v27.16b				//keep the last round key (rk10 on the path from above) in v29; v27 is reused as scratch below
	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
	ext	v20.16b, v20.16b, v20.16b, #8

.inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b			//AES block 8k+8 - result
	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
	ext	v22.16b, v22.16b, v22.16b, #8
	ext	v23.16b, v23.16b, v23.16b, #8

	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
	ext	v25.16b, v25.16b, v25.16b, #8
	cmp	x5, #112
	b.gt	.L128_enc_blocks_more_than_7		//more than 112 bytes left: all eight keystream blocks are used

	// At most 7 blocks: the more_than_7 section (which initialises v17, v18
	// and v19) is skipped, so the three GHASH partial products are cleared.
	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	movi	v17.8b, #0

	cmp	x5, #96
	sub	v30.4s, v30.4s, v31.4s
	mov	v5.16b, v4.16b

	mov	v4.16b, v3.16b
	mov	v3.16b, v2.16b
	mov	v2.16b, v1.16b

	movi	v19.8b, #0
	movi	v18.8b, #0
	b.gt	.L128_enc_blocks_more_than_6

	// At most 6 blocks.
	mov	v7.16b, v6.16b
	cmp	x5, #80

	sub	v30.4s, v30.4s, v31.4s
	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	mov	v4.16b, v3.16b
	mov	v3.16b, v1.16b
	b.gt	.L128_enc_blocks_more_than_5

	// At most 5 blocks.
	cmp	x5, #64
	sub	v30.4s, v30.4s, v31.4s

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b

	mov	v5.16b, v4.16b
	mov	v4.16b, v1.16b
	b.gt	.L128_enc_blocks_more_than_4

	// At most 4 blocks.
	mov	v7.16b, v6.16b
	sub	v30.4s, v30.4s, v31.4s
	mov	v6.16b, v5.16b

	mov	v5.16b, v1.16b
	cmp	x5, #48
	b.gt	.L128_enc_blocks_more_than_3

	// At most 3 blocks. v24 is loaded here because the more_than_2 entry
	// point uses it without loading it itself.
	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b
	mov	v6.16b, v1.16b

	cmp	x5, #32
	ldr	q24, [x3, #96]					//load h4k | h3k
	b.gt	.L128_enc_blocks_more_than_2

	// At most 2 blocks.
	cmp	x5, #16

	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v1.16b
	b.gt	.L128_enc_blocks_more_than_1

	// A single (possibly partial) block: v21 is loaded here because the
	// final-block code uses it and the more_than_1 section is skipped.
	ldr	q21, [x3, #48]					//load h2k | h1k
	sub	v30.4s, v30.4s, v31.4s
	b	.L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_7:	//blocks	left >  7
	// Stores the ciphertext block in v9, starts the three GHASH partial
	// products from it (v17 high, v18 mid, v19 low are written, not
	// accumulated) using v25 and the upper half of v24, and produces the
	// next ciphertext block from keystream v1. Falls through to the next
	// section. v27 and v18 are used as scratch for the mid-term operands.
	st1	{ v9.16b}, [x2], #16				//AES final-7 block  - store result

	rev64	v8.16b, v9.16b						//GHASH final-7 block
	ldr	q9, [x0], #16				//AES final-6 block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high

	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in

.inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result

	pmull	v18.1q, v27.1d, v18.1d				//GHASH final-7 block - mid
	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
.L128_enc_blocks_more_than_6:	//blocks	left >  6
	// Stores the ciphertext block in v9, accumulates its GHASH partial
	// products into v17/v18/v19 using v23 and the low half of v24, and
	// produces the next ciphertext block from keystream v2. v16 carries the
	// previous tag only into the first block processed; it is zeroed
	// afterwards. Falls through to the next section.

	st1	{ v9.16b}, [x2], #16				//AES final-6 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-6 block
	ldr	q9, [x0], #16				//AES final-5 block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid

.inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result
	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
.L128_enc_blocks_more_than_5:	//blocks	left >  5
	// Stores the ciphertext block in v9, accumulates its GHASH partial
	// products into v17/v18/v19 using v22 and the upper half of v21 (the
	// mid operand is copied to the upper lane of v27 so pmull2 can be used),
	// and produces the next ciphertext block from keystream v3. Falls
	// through to the next section.

	st1	{ v9.16b}, [x2], #16				//AES final-5 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-5 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid

.inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
.L128_enc_blocks_more_than_4:	//blocks	left >  4
	// Stores the ciphertext block in v9, accumulates its GHASH partial
	// products into v17/v18/v19 using v20 and the low half of v21, and
	// produces the next ciphertext block from keystream v4. Falls through
	// to the next section.

	st1	{ v9.16b}, [x2], #16			  	//AES final-4 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-4 block

	ldr	q9, [x0], #16				//AES final-3 block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low

.inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
.L128_enc_blocks_more_than_3:	//blocks	left >  3
	// Stores the ciphertext block in v9, reloads v25 and v24 with the
	// hash-key values at x3+112 and x3+96 (v24 and v25 held other values
	// above), accumulates the GHASH partial products into v17/v18/v19 using
	// v25 and the upper half of v24, and produces the next ciphertext block
	// from keystream v5. Falls through to the next section.

	st1	{ v9.16b}, [x2], #16			  	//AES final-3 block - store result

	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8

	rev64	v8.16b, v9.16b						//GHASH final-3 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	movi	v16.8b, #0						//suppress further partial tag feed in

	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
	ldr	q24, [x3, #96]				//load h4k | h3k
	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low

	ldr	q9, [x0], #16				//AES final-2 block - load plaintext

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low

.inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result

	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
.L128_enc_blocks_more_than_2:	//blocks	left >  2
	// Stores the ciphertext block in v9, loads v23 from x3+80, accumulates
	// the GHASH partial products into v17/v18/v19 using v23 and the low
	// half of v24 (v24 was loaded from x3+96 either by the section above or
	// by the tail dispatch), and produces the next ciphertext block from
	// keystream v6. Falls through to the next section.

	st1	{ v9.16b}, [x2], #16			  	//AES final-2 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-2 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ldr	q9, [x0], #16				//AES final-1 block - load plaintext

	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
.inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result

	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
.L128_enc_blocks_more_than_1:	//blocks	left >  1
	// Stores the ciphertext block in v9, loads v22 from x3+64 and v21 from
	// x3+48, accumulates the GHASH partial products into v17/v18/v19 using
	// v22 and the upper half of v21, and produces the final (possibly
	// partial) ciphertext block from keystream v7. The low half of v21 is
	// left for the final-block code that follows. Falls through.

	st1	{ v9.16b}, [x2], #16			  	//AES final-1 block - store result

	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	rev64	v8.16b, v9.16b						//GHASH final-1 block
	ldr	q9, [x0], #16				//AES final block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	movi	v16.8b, #0						//suppress further partial tag feed in
	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
.inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result

	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid

	ldr	q21, [x3, #48]				//load h2k | h1k

	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid

	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
.L128_enc_blocks_less_than_1:	//blocks	left <= 1
	// Final block and function exit:
	//   1. write the updated counter back through x16;
	//   2. build a mask in v0 from the bit length in x1 so that only the
	//      valid bits of the last block are kept;
	//   3. store the last ciphertext block, preserving the bytes already in
	//      memory beyond the end of the message;
	//   4. add the masked block to the GHASH partial products, reduce them
	//      with the constant at [x10], and store the result at [x3];
	//   5. return the byte length (x9) in x0 after restoring d8-d15.

	rev32	v30.16b, v30.16b
	str	q30, [x16]					//store the updated counter
	and	x1, x1, #127			 	//bit_length %= 128

	sub	x1, x1, #128			 	//bit_length -= 128

	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])

	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
	and	x1, x1, #127			 	//bit_length %= 128 (128 becomes 0: a full block has no unused bits)

	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block (register shift uses x1 modulo 64)
	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
	cmp	x1, #64

	// Fewer than 64 unused bits: low doubleword is fully kept and x6 masks
	// the high doubleword. Otherwise x6 masks the low doubleword and the
	// high doubleword is entirely cleared.
	csel	x13, x7, x6, lt
	csel	x14, x6, xzr, lt

	mov	v0.d[1], x14
	mov	v0.d[0], x13					//ctr0b is mask for last block

	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits

	rev64	v8.16b, v9.16b						//GHASH final block

	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
	st1	{ v9.16b}, [x2]				//store all 16B

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v16.d[0], v8.d[1]					//GHASH final block - mid

	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8

	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid

	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
	ldr	d16, [x10]			//MODULO - load modulo constant

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low

	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high

	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low

	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
	pmull	v29.1q, v17.1d, v16.1d		  	//MODULO - top 64b align with mid

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		  	//MODULO - karatsuba tidy up

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b		 	//MODULO - fold into mid

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ext	v21.16b, v18.16b, v18.16b, #8			  	//MODULO - other mid alignment

.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		  	//MODULO - fold into low
	ext	v19.16b, v19.16b, v19.16b, #8				//swap the two 64-bit halves
	rev64	v19.16b, v19.16b					//byte-reverse each half
	st1	{ v19.16b }, [x3]					//store the updated GHASH value at offset 0 of x3
	mov	x0, x9						//return value: x9 (bit length >> 3, set at function entry)

	// Epilogue: restore the callee-saved low halves of v8-v15 and release
	// the 80-byte frame created at function entry.
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d8, d9, [sp], #80
	ret

.L128_enc_ret:
	// Early exit taken from the cbz at function entry when x1 is zero.
	// The branch happens before the frame is pushed, so nothing is
	// restored here; the function simply returns 0.
	mov	w0, #0x0
	ret
.size	unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
.globl	unroll8_eor3_aes_gcm_dec_128_kernel
.type	unroll8_eor3_aes_gcm_dec_128_kernel,%function
.align	4
// AES-128-GCM decryption kernel processing eight blocks per iteration, with
// EOR3 emitted as raw .inst words.
// Register use visible in this entry section:
//   x0 = input (ciphertext) pointer, advanced as blocks are loaded
//   x1 = length in bits (x1 >> 3 is used as the byte length); 0 exits early
//   x2 = output pointer, advanced as blocks are stored
//   x3 = GHASH state: the current value is loaded from [x3]; the main loop
//        loads hash-key values from higher offsets
//   x4 = pointer to the counter block (copied to x16; x4 is then reused as
//        the end-of-input pointer)
//   x5 = pointer to the round keys (copied to x8; x5 is then reused as the
//        main-loop end pointer)
// This section builds the frame, computes the keystream for CTR blocks 0..7
// (rounds 0..9; the last round key is applied later by eor3), and then
// either branches to the tail or decrypts the first eight blocks and
// continues with the main loop or the prepretail.
unroll8_eor3_aes_gcm_dec_128_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz	x1, .L128_dec_ret
	// 80-byte frame: d8-d15 at sp+0..63, reduction constant at sp+64.
	stp	d8, d9, [sp, #-80]!
	lsr	x9, x1, #3					//x9 = length in bytes
	mov	x16, x4						//x16 = counter block pointer
	mov	x8, x5						//x8 = round key pointer
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	mov	x5, #0xc200000000000000
	stp	x5, xzr, [sp, #64]
	add	x10, sp, #64					//x10 = address of the constant loaded later as the modulo constant

	mov	x5, x9
	ld1	{ v0.16b}, [x16]					//CTR block 0

	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	sub	x5, x5, #1		//byte_len - 1

	mov	x15, #0x100000000				//set up counter increment
	movi	v31.16b, #0x0
	mov	v31.d[1], x15
	// Load the current GHASH value, swapping the halves and byte-reversing
	// each one (the encrypt kernel applies the same two steps before its
	// store to [x3]).
	ld1	{ v19.16b}, [x3]
	ext	v19.16b, v19.16b, v19.16b, #8
	rev64	v19.16b, v19.16b

	rev32	v30.16b, v0.16b				//set up reversed counter

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 0

	add	v30.4s, v30.4s, v31.4s		//CTR block 0

	rev32	v1.16b, v30.16b				//CTR block 1
	add	v30.4s, v30.4s, v31.4s		//CTR block 1

	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	rev32	v2.16b, v30.16b				//CTR block 2
	add	v30.4s, v30.4s, v31.4s		//CTR block 2
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 0

	rev32	v3.16b, v30.16b				//CTR block 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 3

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 1

	rev32	v4.16b, v30.16b				//CTR block 4
	add	v30.4s, v30.4s, v31.4s		//CTR block 4

	rev32	v5.16b, v30.16b				//CTR block 5
	add	v30.4s, v30.4s, v31.4s		//CTR block 5

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 0

	rev32	v6.16b, v30.16b				//CTR block 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 0

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 0

	rev32	v7.16b, v30.16b				//CTR block 7

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 1

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 0

	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 1

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 1

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 2

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 2

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 3

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 3

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 3

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 3

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 4

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 4

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 5

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 5

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 5

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 5

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 5

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 6

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 6

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 7

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 7

	add	x5, x5, x0					//x5 = input pointer + main-loop byte count (main-loop end pointer)
	add	v30.4s, v30.4s, v31.4s		//CTR block 7

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 8

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 8

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 8

	// Round 9 has no aesmc; the last round key (rk10, v27) is applied
	// together with the ciphertext by the eor3 instructions below.
	aese	v0.16b, v26.16b						//AES block 0 - round 9
	aese	v1.16b, v26.16b						//AES block 1 - round 9
	aese	v6.16b, v26.16b						//AES block 6 - round 9

	ldr	q27, [x8, #160]					//load rk10
	aese	v4.16b, v26.16b						//AES block 4 - round 9
	aese	v3.16b, v26.16b						//AES block 3 - round 9

	aese	v2.16b, v26.16b						//AES block 2 - round 9
	aese	v5.16b, v26.16b						//AES block 5 - round 9
	aese	v7.16b, v26.16b						//AES block 7 - round 9

	add	x4, x0, x1, lsr #3		//end_input_ptr
	cmp	x0, x5				//check if we have <= 8 blocks
	b.ge	.L128_dec_tail						//handle tail

	// First eight blocks: the ciphertext is loaded into v8-v15 and left
	// there (the main loop reads v8-v15 for GHASH), while the plaintext is
	// produced in v0-v7 and stored. v0-v4 are then refilled with CTR blocks
	// 8 to 12.
	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext

.inst	0xce006d00	//eor3 v0.16b, v8.16b, v0.16b, v27.16b				//AES block 0 - result
.inst	0xce016d21	//eor3 v1.16b, v9.16b, v1.16b, v27.16b				//AES block 1 - result
	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result

	rev32	v0.16b, v30.16b				//CTR block 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8
	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext

	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext

	rev32	v1.16b, v30.16b				//CTR block 9
	add	v30.4s, v30.4s, v31.4s		//CTR block 9
	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext

.inst	0xce036d63	//eor3 v3.16b, v11.16b, v3.16b, v27.16b				//AES block 3 - result
.inst	0xce026d42	//eor3 v2.16b, v10.16b, v2.16b, v27.16b				//AES block 2 - result
	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result

	rev32	v2.16b, v30.16b				//CTR block 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 10

.inst	0xce066dc6	//eor3 v6.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result

	rev32	v3.16b, v30.16b				//CTR block 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 11

.inst	0xce046d84	//eor3 v4.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
.inst	0xce056da5	//eor3 v5.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result

.inst	0xce076de7	//eor3 v7.16b, v15.16b, v7.16b, v27.16b				//AES block 7 - result
	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
	rev32	v4.16b, v30.16b				//CTR block 12

	cmp	x0, x5				//check if we have <= 8 blocks
	add	v30.4s, v30.4s, v31.4s		//CTR block 12
	b.ge	.L128_dec_prepretail					//do prepretail

.L128_dec_main_loop:	//main	loop start
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8

	rev64	v9.16b, v9.16b						//GHASH block 8k+1
	rev64	v8.16b, v8.16b						//GHASH block 8k
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0

	rev64	v14.16b, v14.16b						//GHASH block 8k+6
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8

	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	rev64	v12.16b, v12.16b						//GHASH block 8k+4
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k

	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
	rev64	v11.16b, v11.16b						//GHASH block 8k+3

	rev32	v7.16b, v30.16b				//CTR block 8k+15
	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	rev64	v13.16b, v13.16b						//GHASH block 8k+5

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8

	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	rev64	v15.16b, v15.16b						//GHASH block 8k+7
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid

	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3

	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high

	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4

	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5

	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b 			//GHASH block 8k+4, 8k+5 - mid
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	ldr	d16, [x10]			//MODULO - load modulo constant
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7

	rev32	v20.16b, v30.16b					//CTR block 8k+16
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	rev32	v22.16b, v30.16b					//CTR block 8k+17

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext

	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
	rev32	v23.16b, v30.16b					//CTR block 8k+18

	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8

	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
	ldr	q27, [x8, #160]					//load rk10

	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9

	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment

	rev32	v25.16b, v30.16b					//CTR block 8k+19
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19

	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
.inst	0xce016d21	//eor3 v1.16b, v9.16b, v1.16b, v27.16b				//AES block 8k+9 - result

.inst	0xce006d00	//eor3 v0.16b, v8.16b, v0.16b, v27.16b				//AES block 8k+8 - result
.inst	0xce076de7	//eor3 v7.16b, v15.16b, v7.16b, v27.16b				//AES block 8k+15 - result
.inst	0xce066dc6	//eor3 v6.16b, v14.16b, v6.16b, v27.16b				//AES block 8k+14 - result

.inst	0xce026d42	//eor3 v2.16b, v10.16b, v2.16b, v27.16b				//AES block 8k+10 - result
	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
	mov	v1.16b, v22.16b					//CTR block 8k+17

.inst	0xce046d84	//eor3 v4.16b, v12.16b, v4.16b, v27.16b				//AES block 8k+12 - result
.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	mov	v0.16b, v20.16b					//CTR block 8k+16

.inst	0xce036d63	//eor3 v3.16b, v11.16b, v3.16b, v27.16b				//AES block 8k+11 - result
	cmp	x0, x5				//.LOOP CONTROL
	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result

.inst	0xce056da5	//eor3 v5.16b, v13.16b, v5.16b, v27.16b				//AES block 8k+13 - result
	mov	v2.16b, v23.16b					//CTR block 8k+18

	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
	rev32	v4.16b, v30.16b				//CTR block 8k+20
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20

	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
	mov	v3.16b, v25.16b					//CTR block 8k+19
	b.lt	.L128_dec_main_loop

.L128_dec_prepretail:	//PREPRETAIL
	// Final full GHASH pass, interleaved with keystream generation for the TAIL.
	//
	//  GHASH: the eight ciphertext blocks held in v8-v15 are byte-reversed
	//         (rev64); the running hash in v19 is rotated and XORed into
	//         block 8k (PRE 0 / PRE 1); every block is then multiplied by the
	//         hash-key value loaded from x3 (see the "load hN" comments).
	//         Partial products are accumulated in v17 (high), v18 (mid) and
	//         v19 (low) and reduced with the constant read from [x10]; the
	//         reduced value ends up in v19 ("MODULO - fold into low").
	//  AES:   counter blocks v0-v7 go through rounds 0-8 (aese+aesmc) and
	//         round 9 (aese only) using rk0-rk9 from x8.  v5, v6 and v7 are
	//         derived here from the counter in v30; v0-v4 must already hold
	//         counter blocks on entry (the main loop above sets them right
	//         before falling through).
	//
	// Nothing is read from x0 and nothing is written to x2 in this section.
	// rk10 is only loaded (into v27); the code at .L128_dec_tail applies it
	// together with the ciphertext XOR.
	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
	rev64	v8.16b, v8.16b						//GHASH block 8k

	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1

	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	rev64	v9.16b, v9.16b						//GHASH block 8k+1

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	rev64	v13.16b, v13.16b						//GHASH block 8k+5

	rev64	v12.16b, v12.16b						//GHASH block 8k+4

	rev64	v14.16b, v14.16b						//GHASH block 8k+6

	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k
	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0

	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	rev32	v7.16b, v30.16b				//CTR block 8k+15
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0

.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k - mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0

	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	rev64	v15.16b, v15.16b						//GHASH block 8k+7
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3

	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4

	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5

	ldr	d16, [x10]			//MODULO - load modulo constant
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	ldr	q27, [x8, #160]					//load rk10 (not applied here; consumed after .L128_dec_tail)

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8

	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8

.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	// v7 was generated from v30 above without the matching increment; after
	// this add v30 is one step past the counter used for v7.
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9

	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9

	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9

.L128_dec_tail:	//TAIL
	// Dispatches on the number of input bytes still to process (x5) to one of
	// the .L128_dec_blocks_* entry points below.  The first remaining block is
	// always loaded and decrypted here (v9 = ciphertext, v12 = result).
	//
	// When reached from PREPRETAIL, v0-v7 hold keystream blocks that have
	// completed AES round 9 and v27 holds rk10.
	// NOTE(review): other entry paths into this label are not visible in this
	// excerpt; presumably they establish the same register state - confirm.
	//
	// With fewer than 8 blocks left, each rung of the ladder below
	//   * copies keystream registers one slot upwards so that the second
	//     remaining block is always XORed with the keystream originally in v1,
	//     the third with the one originally in v2, and so on, and
	//   * subtracts one counter step (v31) from v30 for the keystream block
	//     that will not be consumed.

	mov	v29.16b, v27.16b				//keep rk10 in v29; v27 is reused as GHASH scratch below
	sub	x5, x4, x0 	//x5 (main_end_input_ptr) = x4 - x0 = number of bytes left to process

	cmp	x5, #112				//more than 7 blocks left? flags are consumed by the b.gt below

	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
	ext	v25.16b, v25.16b, v25.16b, #8
	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext

	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
	ext	v20.16b, v20.16b, v20.16b, #8
	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag

	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
	ext	v22.16b, v22.16b, v22.16b, #8
	ext	v23.16b, v23.16b, v23.16b, #8

.inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
	b.gt	.L128_dec_blocks_more_than_7

	// <= 7 blocks: the ">7" stage is skipped, so the GHASH accumulators
	// v19 (low), v17 (high) and v18 (mid) are cleared here instead.
	cmp	x5, #96
	mov	v7.16b, v6.16b
	movi	v19.8b, #0

	movi	v17.8b, #0
	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	mov	v4.16b, v3.16b
	mov	v3.16b, v2.16b
	mov	v2.16b, v1.16b

	movi	v18.8b, #0
	sub	v30.4s, v30.4s, v31.4s
	b.gt	.L128_dec_blocks_more_than_6

	// <= 6 blocks
	cmp	x5, #80
	sub	v30.4s, v30.4s, v31.4s

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	mov	v4.16b, v3.16b
	mov	v3.16b, v1.16b
	b.gt	.L128_dec_blocks_more_than_5

	// <= 5 blocks
	cmp	x5, #64

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	mov	v4.16b, v1.16b
	sub	v30.4s, v30.4s, v31.4s
	b.gt	.L128_dec_blocks_more_than_4

	// <= 4 blocks
	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b

	mov	v5.16b, v1.16b
	cmp	x5, #48
	b.gt	.L128_dec_blocks_more_than_3

	// <= 3 blocks
	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b
	cmp	x5, #32

	ldr	q24, [x3, #96]				//load h4k | h3k (needed by the ">2" stage, which does not load it itself)
	mov	v6.16b, v1.16b
	b.gt	.L128_dec_blocks_more_than_2

	// <= 2 blocks
	cmp	x5, #16

	mov	v7.16b, v1.16b
	sub	v30.4s, v30.4s, v31.4s
	b.gt	.L128_dec_blocks_more_than_1

	// exactly one (possibly partial) block
	sub	v30.4s, v30.4s, v31.4s
	ldr	q21, [x3, #48]				//load h2k | h1k (needed by the final-block stage)
	b	.L128_dec_blocks_less_than_1
.L128_dec_blocks_more_than_7:	//blocks	left >  7
	// Tail stage for the block that is 7 before the final one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash, v29 = last round key.
	// Out: v9/v12 hold the same pair for the next block (keystream from v1).
	// This is the only tail stage that initialises the GHASH accumulators
	// v19 (low), v17 (high) and v18 (mid) by plain assignment; every later
	// stage XORs into them.  Multiplier: v25 (low/high), upper half of v24 (mid).
	rev64	v8.16b, v9.16b						//GHASH final-7 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid (upper half of v24 moved to lane 0 for the pmull below)

	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid

	movi	v16.8b, #0						//suppress further partial tag feed in
	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
	st1	{ v12.16b}, [x2], #16			 	//AES final-7 block  - store result
.inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result

	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
.L128_dec_blocks_more_than_6:	//blocks	left >  6
	// Tail stage for the block that is 6 before the final one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash (zero once consumed).
	// Out: v9/v12 hold the same pair for the next block (keystream from v2).
	// GHASH multiplier: v23 (low/high), lower half of v24 (mid); the products
	// are XORed into v19 (low), v17 (high), v18 (mid).

	rev64	v8.16b, v9.16b						//GHASH final-6 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
	st1	{ v12.16b}, [x2], #16			 	//AES final-6 block - store result
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
.inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
.L128_dec_blocks_more_than_5:	//blocks	left >  5
	// Tail stage for the block that is 5 before the final one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash (zero once consumed).
	// Out: v9/v12 hold the same pair for the next block (keystream from v3).
	// GHASH multiplier: v22 (low/high), upper half of v21 (mid) - the mid
	// operand is duplicated into lane 1 of v27 so that pmull2 can be used.

	rev64	v8.16b, v9.16b						//GHASH final-5 block

	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
	st1	{ v12.16b}, [x2], #16			 	//AES final-5 block - store result

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid

.inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
.L128_dec_blocks_more_than_4:	//blocks	left >  4
	// Tail stage for the block that is 4 before the final one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash (zero once consumed).
	// Out: v9/v12 hold the same pair for the next block (keystream from v4).
	// GHASH multiplier: v20 (low/high), lower half of v21 (mid).

	rev64	v8.16b, v9.16b						//GHASH final-4 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext

	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high

	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid

.inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low

	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
.L128_dec_blocks_more_than_3:	//blocks	left >  3
	// Tail stage for the block that is 3 before the final one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash (zero once consumed).
	// Out: v9/v12 hold the same pair for the next block (keystream from v5).
	// From here on the hash-key registers are reloaded with lower powers:
	// v25 <- [x3,#112] and v24 <- [x3,#96].  GHASH multiplier: v25 (low/high),
	// upper half of v24 (mid, via pmull2 on the duplicated operand in v27).

	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
	rev64	v8.16b, v9.16b						//GHASH final-3 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid

	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	ldr	q24, [x3, #96]				//load h4k | h3k

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid

	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext

	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high

	movi	v16.8b, #0						//suppress further partial tag feed in
.inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low

	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
.L128_dec_blocks_more_than_2:	//blocks	left >  2
	// Tail stage for the block that is 2 before the final one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash (zero once consumed),
	//      v24 = contents of [x3,#96] (loaded by the previous stage, or by the
	//      dispatch ladder when this label is branched to directly).
	// Out: v9/v12 hold the same pair for the next block (keystream from v6).
	// GHASH multiplier: v23 <- [x3,#80] (low/high), lower half of v24 (mid).

	rev64	v8.16b, v9.16b						//GHASH final-2 block

	st1	{ v12.16b}, [x2], #16			 	//AES final-2 block - store result

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	movi	v16.8b, #0						//suppress further partial tag feed in

	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low

	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low

.inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
.L128_dec_blocks_more_than_1:	//blocks	left >  1
	// Tail stage for the last block before the final (possibly partial) one.
	// In : v9 = ciphertext of this block, v12 = its decrypted result (not yet
	//      stored), v16 = value to fold into the hash (zero once consumed).
	// Out: v9 = ciphertext of the final block, v12 = its decrypted result
	//      (keystream from v7), v21 = contents of [x3,#48] for the next stage.
	// GHASH multiplier: v22 <- [x3,#64] (low/high), upper half of v21 (mid,
	// via pmull2 on the duplicated operand in v27).

	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
	rev64	v8.16b, v9.16b						//GHASH final-1 block

	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	movi	v16.8b, #0						//suppress further partial tag feed in

	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid

	ldr	q9, [x0], #16				//AES final block - load ciphertext
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
	ldr	q21, [x3, #48]				//load h2k | h1k

	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
.inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result

	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
.L128_dec_blocks_less_than_1:	//blocks	left <= 1
	// Final (possibly partial) block, GHASH reduction, state write-back and
	// function epilogue.
	// In : v9 = ciphertext of the final block (a full 16 bytes were loaded),
	//      v12 = its decrypted result, v16 = value to fold into the hash,
	//      v21 = contents of [x3,#48], v17/v18/v19 = GHASH high/mid/low
	//      accumulators, v30 = counter, x1 = length in bits.
	//
	// Step 1: build a 128-bit mask in v0 that is all-ones over the valid bits of
	//         the last block and zero over the rest.
	// Step 2: hash the masked ciphertext; merge the decrypted bytes with the
	//         bytes already present at [x2] so that only valid bytes change.
	// Step 3: reduce the accumulators, store the hash state to [x3] and the
	//         counter to [x16].
	// Step 4: restore d8-d15, release the 80-byte frame and return x9 in x0.

	and	x1, x1, #127				//bit_length %= 128

	sub	x1, x1, #128				//bit_length -= 128

	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])

	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
	and	x1, x1, #127				//bit_length %= 128 -> number of unused bits in the last block, in [0,127]

	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block (register shifts use the amount modulo 64)
	cmp	x1, #64
	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff

	csel	x13, x7, x6, lt				//fewer than 64 unused bits: low half fully valid, else low half takes the partial mask
	csel	x14, x6, xzr, lt				//fewer than 64 unused bits: high half takes the partial mask, else high half is empty

	mov	v0.d[1], x14
	mov	v0.d[0], x13					//ctr0b is mask for last block

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored

	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits

	rev64	v8.16b, v9.16b						//GHASH final block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
	ins	v16.d[0], v8.d[1]					//GHASH final block - mid (v16 is free to be reused as scratch from here)

	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid

	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing (taken from v26 wherever the mask bit is 0)

	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
	st1	{ v12.16b}, [x2]				//store all 16B

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low

	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
	ldr	d16, [x10]			//MODULO - load modulo constant

	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low

	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up

	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment

	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up

.inst	0xce115652	//eor3 v18.16b, v18.16b, v17.16b, v21.16b			//MODULO - fold into mid

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment

.inst	0xce124673	//eor3 v19.16b, v19.16b, v18.16b, v17.16b			//MODULO - fold into low
	ext	v19.16b, v19.16b, v19.16b, #8				//swap the 64-bit halves ...
	rev64	v19.16b, v19.16b				//... and byte-reverse each half before writing the state back
	st1	{ v19.16b }, [x3]				//store the updated hash state at x3
	rev32	v30.16b, v30.16b				//byte-reverse each 32-bit word of the counter before storing

	str	q30, [x16]					//store the updated counter

	// NOTE(review): x9 is set in the prologue, which is outside this excerpt;
	// the sibling kernels compute it as x1 >> 3 (length in bytes) - confirm.
	mov	x0, x9

	ldp	d10, d11, [sp, #16]				//restore the saved d8-d15 (low halves of v8-v15)
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d8, d9, [sp], #80				//last restore also releases the 80-byte frame
	ret
.L128_dec_ret:
	// Early-out: returns 0 in w0 without restoring registers or adjusting sp.
	// NOTE(review): presumably reached from a zero-length check at function
	// entry, before any frame is created (the sibling kernels use
	// "cbz x1, <ret label>" as their first instruction) - the branch for this
	// kernel is outside this excerpt; confirm.
	mov	w0, #0x0
	ret
.size	unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
.globl	unroll8_eor3_aes_gcm_enc_192_kernel
.type	unroll8_eor3_aes_gcm_enc_192_kernel,%function
.align	4
unroll8_eor3_aes_gcm_enc_192_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz	x1, .L192_enc_ret
	stp	d8, d9, [sp, #-80]!
	lsr	x9, x1, #3
	mov	x16, x4
	mov	x8, x5
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	mov	x5, #0xc200000000000000
	stp	x5, xzr, [sp, #64]
	add	x10, sp, #64

	mov	x5, x9
	ld1	{ v0.16b}, [x16]					//CTR block 0

	mov	x15, #0x100000000				//set up counter increment
	movi	v31.16b, #0x0
	mov	v31.d[1], x15

	rev32	v30.16b, v0.16b				//set up reversed counter

	add	v30.4s, v30.4s, v31.4s		//CTR block 0

	rev32	v1.16b, v30.16b				//CTR block 1
	add	v30.4s, v30.4s, v31.4s		//CTR block 1

	rev32	v2.16b, v30.16b				//CTR block 2
	add	v30.4s, v30.4s, v31.4s		//CTR block 2

	rev32	v3.16b, v30.16b				//CTR block 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 3

	rev32	v4.16b, v30.16b				//CTR block 4
	add	v30.4s, v30.4s, v31.4s		//CTR block 4
	sub	x5, x5, #1		//byte_len - 1

	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	rev32	v5.16b, v30.16b				//CTR block 5
	add	v30.4s, v30.4s, v31.4s		//CTR block 5
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1

	add	x5, x5, x0

	rev32	v6.16b, v30.16b				//CTR block 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 6

	rev32	v7.16b, v30.16b				//CTR block 7

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 0

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 0

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 1

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 1

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 1

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 2

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 2

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 2

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 3

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 3

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 3

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 3

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 3

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 4

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 4

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 5

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 5

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 5

	add	v30.4s, v30.4s, v31.4s		//CTR block 7

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 6

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 6

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 7

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 7

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 7

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 7

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 8

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 8

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 8

	add	x4, x0, x1, lsr #3		//end_input_ptr
	cmp	x0, x5				//check if we have <= 8 blocks
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 9

	ld1	{ v19.16b}, [x3]
	ext	v19.16b, v19.16b, v19.16b, #8
	rev64	v19.16b, v19.16b
	ldp	q27, q28, [x8, #160]				//load rk10, rk11

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 9

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 9

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 9

	//Rounds 10 and 11 for the first eight counter blocks (v0..v7 = AES block 0..7).
	//Round 11 is a bare aese (no aesmc); the last round key rk12 is XORed in later
	//by the eor3 that produces each result (or by the tail code).
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 10

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 10

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 10

	aese	v6.16b, v28.16b						//AES block 6 - round 11
	aese	v3.16b, v28.16b						//AES block 3 - round 11

	aese	v4.16b, v28.16b						//AES block 4 - round 11
	aese	v7.16b, v28.16b						//AES block 7 - round 11
	ldr	q26, [x8, #192]					//load rk12 (v26 is free: every round-9 use is above)

	aese	v1.16b, v28.16b						//AES block 1 - round 11
	aese	v5.16b, v28.16b						//AES block 5 - round 11

	aese	v2.16b, v28.16b						//AES block 2 - round 11
	aese	v0.16b, v28.16b						//AES block 0 - round 11
	//Flags still come from the earlier "cmp x0, x5": only SIMD/AES ops and loads
	//were issued in between, and none of them write NZCV.
	b.ge	.L192_enc_tail						//handle tail

	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext

	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext

	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext

	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext

.inst	0xce006908	//eor3 v8.16b, v8.16b, v0.16b, v26.16b				//AES block 0 - result
	rev32	v0.16b, v30.16b				//CTR block 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8

.inst	0xce03696b	//eor3 v11.16b, v11.16b, v3.16b, v26.16b				//AES block 3 - result
.inst	0xce016929	//eor3 v9.16b, v9.16b, v1.16b, v26.16b				//AES block 1 - result

	rev32	v1.16b, v30.16b				//CTR block 9
	add	v30.4s, v30.4s, v31.4s		//CTR block 9
.inst	0xce04698c	//eor3 v12.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result

.inst	0xce0569ad	//eor3 v13.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
.inst	0xce0769ef	//eor3 v15.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result

.inst	0xce02694a	//eor3 v10.16b, v10.16b, v2.16b, v26.16b				//AES block 2 - result
	rev32	v2.16b, v30.16b				//CTR block 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 10

	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
	cmp	x0, x5				//check if we have <= 8 blocks

	rev32	v3.16b, v30.16b				//CTR block 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 11
.inst	0xce0669ce	//eor3 v14.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result

	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result

	rev32	v4.16b, v30.16b				//CTR block 12
	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
	add	v30.4s, v30.4s, v31.4s		//CTR block 12

	b.ge	.L192_enc_prepretail					//do prepretail

.L192_enc_main_loop:	//main	loop start
	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	rev64	v10.16b, v10.16b						//GHASH block 8k+2

	rev32	v5.16b, v30.16b				//CTR block 8k+13
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8

	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
	rev64	v8.16b, v8.16b						//GHASH block 8k
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8

	rev64	v9.16b, v9.16b						//GHASH block 8k+1
	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
	rev32	v7.16b, v30.16b				//CTR block 8k+15
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low

	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k

	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1

	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1

	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2

	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	ldp	q27, q28, [x8, #64]				//load rk4, rk5

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k - mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3

	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3

	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4

.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)

	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5

	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high

	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	ldp	q28, q26, [x8, #128]				//load rk8, rk9
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7

.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	ldr	d16, [x10]			//MODULO - load modulo constant
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7

	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	ldp	q27, q28, [x8, #160]				//load rk10, rk11

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
	rev32	v20.16b, v30.16b					//CTR block 8k+16
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext

	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	rev32	v22.16b, v30.16b					//CTR block 8k+17
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	ldr	q26, [x8, #192]					//load rk12
	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext

	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext

	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10

	rev32	v23.16b, v30.16b					//CTR block 8k+18
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18

	//Final AES round for the current eight blocks, interleaved with:
	//  - eor3 of plaintext (v8..v15), keystream (v0..v7) and rk12 (v26) -> ciphertext
	//  - moving the pre-computed counter blocks 8k+16..8k+19 (v20, v22, v23, v25)
	//    into v0..v3 for the next iteration
	//  - the last GHASH reduction step (fold into low, v19)
	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
.inst	0xce04698c	//eor3 v12.16b, v12.16b, v4.16b, v26.16b				//AES block 8k+12 - result

	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11

	rev32	v25.16b, v30.16b					//CTR block 8k+19
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
.inst	0xce0769ef	//eor3 v15.16b, v15.16b, v7.16b, v26.16b				//AES block 8k+15 - result

.inst	0xce02694a	//eor3 v10.16b, v10.16b, v2.16b, v26.16b				//AES block 8k+10 - result
.inst	0xce006908	//eor3 v8.16b, v8.16b, v0.16b, v26.16b				//AES block 8k+8 - result
	mov	v2.16b, v23.16b					//CTR block 8k+18

.inst	0xce016929	//eor3 v9.16b, v9.16b, v1.16b, v26.16b				//AES block 8k+9 - result
	mov	v1.16b, v22.16b					//CTR block 8k+17
	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment

.inst	0xce0669ce	//eor3 v14.16b, v14.16b, v6.16b, v26.16b				//AES block 8k+14 - result
	mov	v0.16b, v20.16b					//CTR block 8k+16
	rev32	v4.16b, v30.16b				//CTR block 8k+20

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
.inst	0xce0569ad	//eor3 v13.16b, v13.16b, v5.16b, v26.16b				//AES block 8k+13 - result
.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low

.inst	0xce03696b	//eor3 v11.16b, v11.16b, v3.16b, v26.16b				//AES block 8k+11 - result
	mov	v3.16b, v25.16b					//CTR block 8k+19

	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result

	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result

	cmp	x0, x5				//LOOP CONTROL - input pointer vs. x5 (main-loop end)
	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result (stp leaves the flags intact)
	b.lt	.L192_enc_main_loop

.L192_enc_prepretail:	//PREPRETAIL
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v8.16b, v8.16b						//GHASH block 8k
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k

	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8

	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	rev32	v7.16b, v30.16b				//CTR block 8k+15
	rev64	v9.16b, v9.16b						//GHASH block 8k+1

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0

	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low

	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2

	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	ldp	q27, q28, [x8, #64]				//load rk4, rk5

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3

	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4

.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low

	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
	ldr	d16, [x10]			//MODULO - load modulo constant
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6

	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
	ldp	q27, q28, [x8, #160]				//load rk10, rk11
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9

	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ldr	q26, [x8, #192]					//load rk12

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10

.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10

	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11

	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11

.L192_enc_tail:	//TAIL
	//Handles the last 1..8 blocks. On entry v0..v7 hold counter blocks that have
	//been through rounds 0..11, v26 holds rk12 and v19 holds the running GHASH value.
	//x5 is recomputed as the number of input bytes left (x4 = end_input_ptr).
	//The first tail block is encrypted unconditionally; the compare ladder below
	//then jumps to .L192_enc_blocks_more_than_N according to how many bytes remain.
	//Every fall-through step (fewer blocks than tested for):
	//  - slides the prepared keystream registers one slot up (v1 -> v2 -> ... -> v7)
	//    NOTE(review): presumably so each later label finds its keystream in a fixed
	//    register - those labels are mostly outside this view, confirm there
	//  - subtracts one block increment (v31) from the counter v30, undoing the
	//    advance made for a counter block that will not be used
	//The <=7-block paths also clear the GHASH accumulators v17 (high), v19 (low)
	//and v18 (mid); the >7 path overwrites them with pmull results instead.

	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
	ext	v20.16b, v20.16b, v20.16b, #8
	sub	x5, x4, x0 	//x5 (was main_end_input_ptr) is now the number of bytes left to process

	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext

	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
	ext	v25.16b, v25.16b, v25.16b, #8

	mov	v29.16b, v26.16b				//keep rk12 in v29 for the rest of the tail

	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
	ext	v22.16b, v22.16b, v22.16b, #8
	ext	v23.16b, v23.16b, v23.16b, #8
	cmp	x5, #112					//more than 7 blocks (7 * 16 bytes) left?

.inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b			//AES block 8k+8 - result
	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
	b.gt	.L192_enc_blocks_more_than_7

	cmp	x5, #96						//more than 6 blocks left?
	mov	v7.16b, v6.16b
	movi	v17.8b, #0					//GHASH high accumulator = 0

	mov	v6.16b, v5.16b
	movi	v19.8b, #0					//GHASH low accumulator = 0
	sub	v30.4s, v30.4s, v31.4s				//one counter block fewer is needed

	mov	v5.16b, v4.16b
	mov	v4.16b, v3.16b
	mov	v3.16b, v2.16b

	mov	v2.16b, v1.16b
	movi	v18.8b, #0					//GHASH mid accumulator = 0
	b.gt	.L192_enc_blocks_more_than_6

	mov	v7.16b, v6.16b
	cmp	x5, #80						//more than 5 blocks left?

	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b
	mov	v4.16b, v3.16b

	mov	v3.16b, v1.16b					//v1 still equals v2 here, so this matches a v2 -> v3 slide
	sub	v30.4s, v30.4s, v31.4s
	b.gt	.L192_enc_blocks_more_than_5

	cmp	x5, #64						//more than 4 blocks left?
	sub	v30.4s, v30.4s, v31.4s

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	mov	v4.16b, v1.16b
	b.gt	.L192_enc_blocks_more_than_4

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	mov	v5.16b, v1.16b

	sub	v30.4s, v30.4s, v31.4s
	cmp	x5, #48						//more than 3 blocks left?
	b.gt	.L192_enc_blocks_more_than_3

	mov	v7.16b, v6.16b
	mov	v6.16b, v1.16b
	sub	v30.4s, v30.4s, v31.4s

	ldr	q24, [x3, #96]				//load h4k | h3k
	cmp	x5, #32						//more than 2 blocks left?
	b.gt	.L192_enc_blocks_more_than_2

	sub	v30.4s, v30.4s, v31.4s

	cmp	x5, #16						//more than 1 block left?
	mov	v7.16b, v1.16b
	b.gt	.L192_enc_blocks_more_than_1

	sub	v30.4s, v30.4s, v31.4s
	ldr	q21, [x3, #48]				//load h2k | h1k
	b	.L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_7:	//blocks	left >  7
	// Reached from the tail dispatch when more than 112 bytes remain.
	// Stores the first tail block, starts the three GHASH accumulators
	// (v19 = low, v17 = high, v18 = mid) from it using h8 in v25, and
	// produces the next output block from keystream v1. The pmull results
	// are written directly (not xor-accumulated) because this is the first
	// product of the tail.
	st1	{ v9.16b}, [x2], #16			 	//AES final-7 block  - store result

	rev64	v8.16b, v9.16b						//GHASH final-7 block
	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid (multiplier: upper half of the h8k | h7k pair)

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid

	ldr	q9, [x0], #16				//AES final-6 block - load plaintext

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid (high half xor low half of the block)
	movi	v16.8b, #0						//suppress further partial tag feed in (64-bit movi clears all of v16)
	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high

	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
.inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result
.L192_enc_blocks_more_than_6:	//blocks	left >  6

	// Stores the block in v9, folds it into the GHASH accumulators using h7
	// (v23) and the lower half of v24 for the middle product, then produces
	// the next output block from keystream v2. v26/v27/v28 are scratch
	// registers for the partial products.
	st1	{ v9.16b}, [x2], #16			 	//AES final-6 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-6 block

	ldr	q9, [x0], #16				//AES final-5 block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier block already consumed it)

	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
.inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result

	movi	v16.8b, #0						//suppress further partial tag feed in
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
.L192_enc_blocks_more_than_5:	//blocks	left >  5

	// Stores the block in v9, folds it into the GHASH accumulators using h6
	// (v22), then produces the next output block from keystream v3. The
	// middle multiplier is the upper half of v21, so the block's
	// (high xor low) value is duplicated into the upper lane of v27 and
	// multiplied with pmull2.
	st1	{ v9.16b}, [x2], #16			 	//AES final-5 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-5 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid

	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high

	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid (copy to upper lane for pmull2)
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid

.inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
.L192_enc_blocks_more_than_4:	//blocks	left >  4

	// Stores the block in v9, folds it into the GHASH accumulators using h5
	// (v20) and the lower half of v21 for the middle product, then produces
	// the next output block from keystream v4.
	st1	{ v9.16b}, [x2], #16				//AES final-4 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-4 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid

	movi	v16.8b, #0						//suppress further partial tag feed in
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low

	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
.inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
.L192_enc_blocks_more_than_3:	//blocks	left >  3

	// Stores the block in v9, folds it into the GHASH accumulators using h4
	// (reloaded into v25) and the upper half of v24 (reloaded with the
	// h4k | h3k pair), then produces the next output block from keystream
	// v5. v24/v25 are overwritten here: their previous contents (h8k | h7k
	// and h8) are only read by sections that lie before this point.
	ldr	q24, [x3, #96]				//load h4k | h3k
	st1	{ v9.16b}, [x2], #16			 	//AES final-3 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-3 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	movi	v16.8b, #0						//suppress further partial tag feed in

	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8				//swap the 64-bit halves of h4

	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid

.inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid (copy to upper lane for pmull2)
	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low

	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
.L192_enc_blocks_more_than_2:	//blocks	left >  2

	// Stores the block in v9, folds it into the GHASH accumulators using h3
	// (loaded into v23) and the lower half of v24, then produces the next
	// output block from keystream v6. v24 must already hold the h4k | h3k
	// pair: it is loaded either by the section that falls through into this
	// one or by the tail dispatch before it branches here.
	st1	{ v9.16b}, [x2], #16			 	//AES final-2 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-2 block
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8				//swap the 64-bit halves of h3

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
.inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
.L192_enc_blocks_more_than_1:	//blocks	left >  1

	// Stores the block in v9, folds it into the GHASH accumulators using h2
	// (loaded into v22) and the upper half of v21 (reloaded with the
	// h2k | h1k pair), then combines the final plaintext block with
	// keystream v7. The final block is not stored here; it is masked and
	// stored by the code that follows.
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8				//swap the 64-bit halves of h2
	st1	{ v9.16b}, [x2], #16			 	//AES final-1 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-1 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid

	ldr	q9, [x0], #16				//AES final block - load plaintext (a full 16 bytes is read even if the last block is partial)
	ldr	q21, [x3, #48]				//load h2k | h1k

	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid (copy to upper lane for pmull2)

.inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid

	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
.L192_enc_blocks_less_than_1:	//blocks	left <= 1

	// Final (possibly partial) block, GHASH reduction, state write-back and
	// function epilogue.
	//
	// A 128-bit mask is built in v0 from the bit length in x1: after the
	// arithmetic below x1 is the number of unused bits in the last block
	// (0 when the block is full). x13/x14 become the low/high 64-bit halves
	// of a mask that keeps only the valid bytes. Note that lsr by register
	// uses the shift amount modulo 64, which is why the >= 64 case can reuse
	// the same shifted value for the low half.
	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
	and	x1, x1, #127				//bit_length %= 128

	sub	x1, x1, #128				//bit_length -= 128

	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])

	and	x1, x1, #127				//bit_length %= 128 (128 wraps to 0: full block, nothing to mask off)

	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
	cmp	x1, #64
	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff

	csel	x13, x7, x6, lt					//low 64b mask: all ones if fewer than 64 bits are unused, else the shifted mask
	csel	x14, x6, xzr, lt				//high 64b mask: the shifted mask if fewer than 64 bits are unused, else zero

	mov	v0.d[1], x14
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8				//swap the 64-bit halves of h1

	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
	mov	v0.d[0], x13					//ctr0b is mask for last block

	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits

	rev64	v8.16b, v9.16b						//GHASH final block (taken before bif, so GHASH sees the zero-padded block)
	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing

	st1	{ v9.16b}, [x2]				//store all 16B (bytes beyond the message keep their previous contents)

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v16.d[0], v8.d[1]					//GHASH final block - mid (v16 is free once the partial tag has been consumed)
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high

	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low

	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid

	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid (lower half of the h2k | h1k pair)

	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
	ldr	d16, [x10]			//MODULO - load modulo constant (x10 is set in the function prologue; presumably points at the constant saved on the stack - confirm)

	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment

	rev32	v30.16b, v30.16b					//undo the per-word byte swap applied to the counter

	str	q30, [x16]					//store the updated counter
.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up

	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment

.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	ext	v19.16b, v19.16b, v19.16b, #8				//swap halves, then ...
	rev64	v19.16b, v19.16b					//... reverse bytes in each half: full 16-byte reversal of the tag
	st1	{ v19.16b }, [x3]					//write the updated tag back to offset 0 of the GHASH state

	mov	x0, x9					//return sizes (x9 is set in the function prologue; presumably the byte count - confirm)

	// Restore the callee-saved SIMD registers and release the 80-byte frame.
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d8, d9, [sp], #80
	ret

.L192_enc_ret:
	// Early-exit path: returns 0 without restoring registers or adjusting
	// sp, so it must only be reached before the stack frame is created
	// (presumably from a zero-length check at function entry - confirm).
	mov	w0, #0x0					//writing w0 also clears the upper 32 bits of x0
	ret
.size	unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
// unroll8_eor3_aes_gcm_dec_192_kernel
// AES-GCM decryption kernel for 13 round keys (rk0..rk12), processing eight
// counter blocks per iteration and using EOR3 (emitted via .inst).
//
// Register usage on entry, as read by the code of this function:
//   x0 - input (ciphertext) pointer
//   x1 - input length in bits; zero branches straight to .L192_dec_ret
//   x2 - output pointer
//   x3 - GHASH state: current tag at offset 0, hash-key values at #32..#208
//   x4 - pointer to the 16-byte counter block (kept in x16)
//   x5 - pointer to the AES round keys (kept in x8)
// x4 and x5 are copied because both registers are reused as input-range
// pointers later in the function.
//
// Stack frame (80 bytes): d8-d15 at sp+0..sp+63, the 64-bit GHASH reduction
// constant followed by a zero word at sp+64 (addressed through x10).
.globl	unroll8_eor3_aes_gcm_dec_192_kernel
.type	unroll8_eor3_aes_gcm_dec_192_kernel,%function
.align	4
unroll8_eor3_aes_gcm_dec_192_kernel:
	AARCH64_VALID_CALL_TARGET			//macro from arm_arch.h (presumably a branch-target landing pad - confirm)
	cbz	x1, .L192_dec_ret			//nothing to do for a zero bit length
	stp	d8, d9, [sp, #-80]!			//allocate the frame and save the callee-saved SIMD registers
	lsr	x9, x1, #3				//x9 = length in bytes
	mov	x16, x4					//x16 = counter block pointer
	mov	x8, x5					//x8 = round key pointer
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	mov	x5, #0xc200000000000000			//GHASH reduction ("modulo") constant
	stp	x5, xzr, [sp, #64]
	add	x10, sp, #64				//x10 = address of the reduction constant

	mov	x5, x9					//x5 = byte length (turned into the main-loop end pointer later)
	ld1	{ v0.16b}, [x16]					//CTR block 0
	ld1	{ v19.16b}, [x3]					//NOTE(review): v19 is loaded from [x3] again later, before it is first read, so this load looks redundant

	mov	x15, #0x100000000			//set up counter increment
	movi	v31.16b, #0x0
	mov	v31.d[1], x15				//v31 = { 0, 0, 0, 1 } as 32-bit lanes: adds 1 to the last word only

	rev32	v30.16b, v0.16b				//set up reversed counter (bytes swapped within each 32-bit word so the last word can be incremented with a plain add)

	add	v30.4s, v30.4s, v31.4s		//CTR block 0

	rev32	v1.16b, v30.16b				//CTR block 1
	add	v30.4s, v30.4s, v31.4s		//CTR block 1

	rev32	v2.16b, v30.16b				//CTR block 2
	add	v30.4s, v30.4s, v31.4s		//CTR block 2

	rev32	v3.16b, v30.16b				//CTR block 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 3

	rev32	v4.16b, v30.16b				//CTR block 4
	add	v30.4s, v30.4s, v31.4s		//CTR block 4

	rev32	v5.16b, v30.16b				//CTR block 5
	add	v30.4s, v30.4s, v31.4s		//CTR block 5
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1

	rev32	v6.16b, v30.16b				//CTR block 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 6

	rev32	v7.16b, v30.16b				//CTR block 7 (the matching counter increment is issued later, between the AES rounds)

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 0

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 0

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 1

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 1

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 1

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 1

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 1

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 2

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 2

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 3

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 3

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 3

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 4

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 4

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 4

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 5

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 5

	sub	x5, x5, #1		//byte_len - 1

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 6

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 6

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	add	v30.4s, v30.4s, v31.4s		//CTR block 7

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 7

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 7

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 7

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 8

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 8

	add	x4, x0, x1, lsr #3		//end_input_ptr
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 9

	ld1	{ v19.16b}, [x3]
	ext	v19.16b, v19.16b, v19.16b, #8
	rev64	v19.16b, v19.16b

	ldp	q27, q28, [x8, #160]				//load rk10, rk11

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
	add	x5, x5, x0

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 9

	cmp	x0, x5				//check if we have <= 8 blocks
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 9

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 9

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 10

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 10

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
	ldr	q26, [x8, #192]					//load rk12

	aese	v0.16b, v28.16b						//AES block 0 - round 11
	aese	v1.16b, v28.16b						//AES block 1 - round 11
	aese	v4.16b, v28.16b						//AES block 4 - round 11

	aese	v6.16b, v28.16b						//AES block 6 - round 11
	aese	v5.16b, v28.16b						//AES block 5 - round 11
	aese	v7.16b, v28.16b						//AES block 7 - round 11

	aese	v2.16b, v28.16b						//AES block 2 - round 11
	aese	v3.16b, v28.16b						//AES block 3 - round 11
	b.ge	.L192_dec_tail						//handle tail

	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext

	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext

	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext

.inst	0xce016921	//eor3 v1.16b, v9.16b, v1.16b, v26.16b				//AES block 1 - result
.inst	0xce006900	//eor3 v0.16b, v8.16b, v0.16b, v26.16b				//AES block 0 - result
	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result

	rev32	v0.16b, v30.16b				//CTR block 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8

	rev32	v1.16b, v30.16b				//CTR block 9
	add	v30.4s, v30.4s, v31.4s		//CTR block 9
.inst	0xce036963	//eor3 v3.16b, v11.16b, v3.16b, v26.16b				//AES block 3 - result

.inst	0xce026942	//eor3 v2.16b, v10.16b, v2.16b, v26.16b				//AES block 2 - result
	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext

	rev32	v2.16b, v30.16b				//CTR block 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 10

.inst	0xce046984	//eor3 v4.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result

	rev32	v3.16b, v30.16b				//CTR block 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 11

.inst	0xce0569a5	//eor3 v5.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
	cmp	x0, x5				//check if we have <= 8 blocks

.inst	0xce0669c6	//eor3 v6.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
.inst	0xce0769e7	//eor3 v7.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
	rev32	v4.16b, v30.16b				//CTR block 12

	add	v30.4s, v30.4s, v31.4s		//CTR block 12
	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
	b.ge	.L192_dec_prepretail					//do prepretail

.L192_dec_main_loop:	//main	loop start
	rev64	v9.16b, v9.16b						//GHASH block 8k+1
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0

	rev64	v8.16b, v8.16b						//GHASH block 8k
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v12.16b, v12.16b						//GHASH block 8k+4
	rev64	v11.16b, v11.16b						//GHASH block 8k+3

	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	rev64	v13.16b, v13.16b						//GHASH block 8k+5

	rev32	v7.16b, v30.16b				//CTR block 8k+15
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0

	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1

	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high

	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	ldp	q27, q28, [x8, #64]				//load rk4, rk5

	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	rev64	v15.16b, v15.16b						//GHASH block 8k+7

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5

	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	rev64	v14.16b, v14.16b						//GHASH block 8k+6

	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6

	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low

	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
	ldp	q28, q26, [x8, #128]				//load rk8, rk9
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7

	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7

.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high

	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
	ldr	d16, [x10]			//MODULO - load modulo constant
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
	rev32	v20.16b, v30.16b					//CTR block 8k+16
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
	ldp	q27, q28, [x8, #160]				//load rk10, rk11

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext

	rev32	v22.16b, v30.16b					//CTR block 8k+17
	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext

	rev32	v23.16b, v30.16b					//CTR block 8k+18
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
	ldr	q26, [x8, #192]					//load rk12

	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10

	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11

.inst	0xce006900	//eor3 v0.16b, v8.16b, v0.16b, v26.16b				//AES block 8k+8 - result
	rev32	v25.16b, v30.16b					//CTR block 8k+19
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10

	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19

	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low

.inst	0xce016921	//eor3 v1.16b, v9.16b, v1.16b, v26.16b				//AES block 8k+9 - result
	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
.inst	0xce036963	//eor3 v3.16b, v11.16b, v3.16b, v26.16b				//AES block 8k+11 - result

.inst	0xce026942	//eor3 v2.16b, v10.16b, v2.16b, v26.16b				//AES block 8k+10 - result
.inst	0xce0769e7	//eor3 v7.16b, v15.16b, v7.16b, v26.16b				//AES block 8k+15 - result
	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result

.inst	0xce0569a5	//eor3 v5.16b, v13.16b, v5.16b, v26.16b				//AES block 8k+13 - result
.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	mov	v3.16b, v25.16b					//CTR block 8k+19

.inst	0xce046984	//eor3 v4.16b, v12.16b, v4.16b, v26.16b				//AES block 8k+12 - result
	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
	cmp	x0, x5				//.LOOP CONTROL

.inst	0xce0669c6	//eor3 v6.16b, v14.16b, v6.16b, v26.16b				//AES block 8k+14 - result
	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
	mov	v0.16b, v20.16b					//CTR block 8k+16

	mov	v1.16b, v22.16b					//CTR block 8k+17
	mov	v2.16b, v23.16b					//CTR block 8k+18

	rev32	v4.16b, v30.16b				//CTR block 8k+20
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
	b.lt	.L192_dec_main_loop

.L192_dec_prepretail:	//PREPRETAIL
	// Entered by fall-through when the main-loop branch just above is not
	// taken. Nothing is read through x0 or written through x2 in this
	// segment; it does two interleaved jobs:
	//  * AES: runs rounds 0..11 on the eight counter blocks v0-v7. Round keys
	//    are reloaded pairwise from x8 into v26/v27/v28; round 11 is a bare
	//    aese (no aesmc). rk12 is loaded into v26 but not applied here.
	//    v5, v6, v7 receive their counter values here; v0-v4 are expected to
	//    be set already.
	//  * GHASH: byte-reverses the eight blocks in v8-v15, folds the running
	//    hash v19 into v8, multiplies block 8k+i by h(8-i) loaded from x3
	//    (h8 for block 8k ... h1 for block 8k+7), accumulating high/mid/low
	//    partial products in v17/v18/v19, then reduces with the constant at
	//    [x10] so that v19 again holds the running hash.
	// Execution falls through into .L192_dec_tail.
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v8.16b, v8.16b						//GHASH block 8k
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0 - swap the 64-bit halves of the running hash

	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1 - fold the running hash into block 8k
	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	rev64	v9.16b, v9.16b						//GHASH block 8k+1

	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	rev32	v7.16b, v30.16b				//CTR block 8k+15 (v30 is advanced for it at the end of this segment)

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1

	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1

	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1

	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
	rev64	v13.16b, v13.16b						//GHASH block 8k+5
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low

.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2

	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3

	rev64	v15.16b, v15.16b						//GHASH block 8k+7

.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	rev64	v12.16b, v12.16b						//GHASH block 8k+4

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4

	rev64	v14.16b, v14.16b						//GHASH block 8k+6
	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5

	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5

	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high

	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5

	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6

	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6

	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high

.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7

	ldp	q28, q26, [x8, #128]				//load rk8, rk9
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7

	ldr	d16, [x10]			//MODULO - load modulo constant
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
	ldp	q27, q28, [x8, #160]				//load rk10, rk11

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ldr	q26, [x8, #192]					//load rk12 (not applied in this segment)
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10

	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11

	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10

	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
.L192_dec_tail:	//TAIL
	// Set-up and dispatch for the final 1..8 blocks.
	// Expected on entry: v0-v7 = keystream blocks with only the last round
	// key still to be XORed in, v26 = that last round key, v19 = running hash.
	// The byte count in x5 is compared against 112, 96, ..., 16 to choose the
	// entry point into the .L192_dec_blocks_more_than_N chain. Every time one
	// fewer block is needed, the remaining keystream blocks are moved one
	// register higher (so the last block always ends up using v7, with the
	// next pending block refilled from v1) and the counter v30 is stepped
	// back by v31.

	sub	x5, x4, x0 	//x5 = x4 - x0: bytes left to process (x4 is presumably the end-of-input pointer set outside this view - confirm)

	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext

	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
	ext	v25.16b, v25.16b, v25.16b, #8

	mov	v29.16b, v26.16b				//keep the last round key in v29; v26 is reused as GHASH scratch further down

	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
	ext	v22.16b, v22.16b, v22.16b, #8
	ext	v23.16b, v23.16b, v23.16b, #8
	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag

.inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
	cmp	x5, #112
	b.gt	.L192_dec_blocks_more_than_7

	// 7 or fewer blocks: the more_than_7 segment, which initialises the
	// v17/v19/v18 accumulators by direct assignment, is skipped, so they are
	// cleared here (interleaved with the register shuffle).
	mov	v7.16b, v6.16b
	movi	v17.8b, #0
	sub	v30.4s, v30.4s, v31.4s

	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b
	mov	v4.16b, v3.16b

	cmp	x5, #96
	movi	v19.8b, #0
	mov	v3.16b, v2.16b

	mov	v2.16b, v1.16b
	movi	v18.8b, #0
	b.gt	.L192_dec_blocks_more_than_6

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	mov	v4.16b, v3.16b
	mov	v3.16b, v1.16b

	sub	v30.4s, v30.4s, v31.4s
	cmp	x5, #80
	b.gt	.L192_dec_blocks_more_than_5

	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b

	mov	v5.16b, v4.16b
	mov	v4.16b, v1.16b
	cmp	x5, #64

	sub	v30.4s, v30.4s, v31.4s				//vector sub: leaves the flags from the cmp above intact
	b.gt	.L192_dec_blocks_more_than_4

	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b

	mov	v5.16b, v1.16b
	cmp	x5, #48
	b.gt	.L192_dec_blocks_more_than_3

	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b
	cmp	x5, #32

	mov	v6.16b, v1.16b
	ldr	q24, [x3, #96]				//load h4k | h3k (needed when entering at more_than_2, which does not load it itself)
	b.gt	.L192_dec_blocks_more_than_2

	sub	v30.4s, v30.4s, v31.4s

	mov	v7.16b, v1.16b
	cmp	x5, #16
	b.gt	.L192_dec_blocks_more_than_1

	sub	v30.4s, v30.4s, v31.4s
	ldr	q21, [x3, #48]				//load h2k | h1k (needed when entering at less_than_1, which does not load it itself)
	b	.L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_7:	//blocks	left >  7
	// First of eight remaining blocks. On entry v9 is its ciphertext and v12
	// its already-computed result. The ciphertext is hashed with v25 (high and
	// low products) and the upper half of v24 (mid product). Unlike the
	// segments below, the products are written straight into v17/v19/v18
	// rather than XORed in, so no prior clearing of the accumulators is needed.
	// Also stores v12, loads the next ciphertext block and computes its result
	// from v1 and the last round key in v29.
	rev64	v8.16b, v9.16b						//GHASH final-7 block

	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext

	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
	st1	{ v12.16b}, [x2], #16			 	//AES final-7 block  - store result

.inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result

	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in
.L192_dec_blocks_more_than_6:	//blocks	left >  6
	// Hashes the ciphertext block in v9 with v23 (high/low) and the lower
	// half of v24 (mid), XORing the products into v17/v18/v19. Stores the
	// pending result v12, loads the next ciphertext block and computes its
	// result from keystream v2. v16 carries the running hash only into the
	// first block hashed in the tail; it is zeroed afterwards.

	rev64	v8.16b, v9.16b						//GHASH final-6 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high

	st1	{ v12.16b}, [x2], #16			 	//AES final-6 block - store result
.inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
.L192_dec_blocks_more_than_5:	//blocks	left >  5
	// Hashes the ciphertext block in v9 with v22 (high/low) and the upper
	// half of v21 (mid). The mid operand is duplicated into v27.d[1] so that
	// pmull2 can pair it with the upper half of v21. Stores the pending
	// result v12, loads the next ciphertext block and computes its result
	// from keystream v3.

	rev64	v8.16b, v9.16b						//GHASH final-5 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high

	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
	movi	v16.8b, #0						//suppress further partial tag feed in
	st1	{ v12.16b}, [x2], #16			 	//AES final-5 block - store result

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
.inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
.L192_dec_blocks_more_than_4:	//blocks	left >  4
	// Hashes the ciphertext block in v9 with v20 (high/low) and the lower
	// half of v21 (mid), XORing the products into v17/v18/v19. Stores the
	// pending result v12, loads the next ciphertext block and computes its
	// result from keystream v4.

	rev64	v8.16b, v9.16b						//GHASH final-4 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	movi	v16.8b, #0						//suppress further partial tag feed in

	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low

	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high

.inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
.L192_dec_blocks_more_than_3:	//blocks	left >  3
	// Reloads v25 with the value at [x3, #112] and v24 with the value at
	// [x3, #96], then hashes the ciphertext block in v9 with v25 (high/low)
	// and the upper half of v24 (mid, via pmull2 on a duplicated operand).
	// Stores the pending result v12, loads the next ciphertext block and
	// computes its result from keystream v5.

	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v8.16b, v9.16b						//GHASH final-3 block
	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
	movi	v16.8b, #0						//suppress further partial tag feed in
	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low

	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
.inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
	ldr	q24, [x3, #96]				//load h4k | h3k

	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid

	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
.L192_dec_blocks_more_than_2:	//blocks	left >  2
	// Reloads v23 with the value at [x3, #80] and hashes the ciphertext block
	// in v9 with v23 (high/low) and the lower half of v24 (mid). v24 is not
	// loaded here: it holds the value from [x3, #96], loaded either by the
	// preceding segment on fall-through or by .L192_dec_tail just before it
	// branches here. Stores the pending result v12, loads the next ciphertext
	// block and computes its result from keystream v6.

	rev64	v8.16b, v9.16b						//GHASH final-2 block
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext

	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
	st1	{ v12.16b}, [x2], #16			 	//AES final-2 block - store result

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
.inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
.L192_dec_blocks_more_than_1:	//blocks	left >  1
	// Second-to-last block. Reloads v22 from [x3, #64] and v21 from
	// [x3, #48], then hashes the ciphertext block in v9 with v22 (high/low)
	// and the upper half of v21 (mid, via pmull2 on a duplicated operand).
	// Stores the pending result v12, loads the final ciphertext block and
	// computes its result from keystream v7; that result is stored by the
	// following segment.

	rev64	v8.16b, v9.16b						//GHASH final-1 block
	ldr	q9, [x0], #16				//AES final block - load ciphertext
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	movi	v16.8b, #0						//suppress further partial tag feed in
	ldr	q21, [x3, #48]				//load h2k | h1k

	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result

	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high

.inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
.L192_dec_blocks_less_than_1:	//blocks	left <= 1
	// Final (possibly partial) block, GHASH reduction and function epilogue.
	//  * Stores the byte-swapped counter v30 to [x16].
	//  * Builds a 128-bit mask in v0 from the bit length in x1, covering only
	//    the bytes of the last block that belong to the message.
	//  * Masks the ciphertext v9 before hashing it, and merges the result v12
	//    with the 16 bytes already present at [x2] so bytes beyond the message
	//    are written back unchanged. Note that 16 bytes are always read from
	//    and written to [x2], even for a partial block.
	//  * Hashes the block with v20 (loaded here from [x3, #32]) and the lower
	//    half of v21. v21 is not loaded here: it comes from [x3, #48], loaded
	//    by the preceding segment or by .L192_dec_tail before branching here.
	//  * Reduces v17/v18/v19 with the constant at [x10] and stores the hash
	//    to [x3].
	//  * Returns x9 in x0 after restoring d8-d15 and releasing the 80-byte
	//    frame.

	rev32	v30.16b, v30.16b
	and	x1, x1, #127				//bit_length %= 128

	sub	x1, x1, #128				//bit_length -= 128
	str	q30, [x16]					//store the updated counter

	neg	x1, x1				//x1 = 128 - (#bits in input % 128), in range [1,128]
	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff

	and	x1, x1, #127				//x1 %= 128: number of unused bits in the last block, in [0,127] (0 = full block)

	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
	lsr	x6, x6, x1				//x6 = all-ones >> (x1 mod 64): mask for the partially used 64-bit half
	cmp	x1, #64

	csel	x13, x7, x6, lt				//mask d[0]: all ones if fewer than 64 bits are unused, else the partial mask
	csel	x14, x6, xzr, lt				//mask d[1]: the partial mask if fewer than 64 bits are unused, else zero
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8

	mov	v0.d[1], x14
	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored

	mov	v0.d[0], x13					//v0 is now the mask for the last block

	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing

	rev64	v8.16b, v9.16b						//GHASH final block

	st1	{ v12.16b}, [x2]				//store all 16B

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low

	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low

	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high

	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
	ldr	d16, [x10]			//MODULO - load modulo constant

	pmull	v21.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment

	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up

.inst	0xce115652	//eor3 v18.16b, v18.16b, v17.16b, v21.16b			//MODULO - fold into mid

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment

.inst	0xce124673	//eor3 v19.16b, v19.16b, v18.16b, v17.16b			//MODULO - fold into low
	ext	v19.16b, v19.16b, v19.16b, #8				//swap the 64-bit halves ...
	rev64	v19.16b, v19.16b					//... and reverse the bytes within each half before storing
	st1	{ v19.16b }, [x3]					//store the updated hash at [x3]

	mov	x0, x9						//return value; x9 is presumably the byte length computed in the prologue (outside this view) - confirm

	ldp	d10, d11, [sp, #16]				//restore the SIMD registers d8-d15
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d8, d9, [sp], #80				//restore d8/d9 and release the 80-byte frame
	ret

.L192_dec_ret:
	// Early-exit path: returns 0 in w0 without restoring registers or
	// adjusting sp.
	// NOTE(review): the branch targeting this label is outside this view; the
	// next kernel in this file uses `cbz x1, .L256_enc_ret` before it sets up
	// its frame, so this is presumably the zero-length-input case - confirm.
	mov	w0, #0x0
	ret
.size	unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
.globl	unroll8_eor3_aes_gcm_enc_256_kernel
.type	unroll8_eor3_aes_gcm_enc_256_kernel,%function
.align	4
unroll8_eor3_aes_gcm_enc_256_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz	x1, .L256_enc_ret
	stp	d8, d9, [sp, #-80]!
	lsr	x9, x1, #3
	mov	x16, x4
	mov	x8, x5
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	mov	x5, #0xc200000000000000
	stp	x5, xzr, [sp, #64]
	add	x10, sp, #64

	ld1	{ v0.16b}, [x16]					//CTR block 0

	mov	x5, x9

	mov	x15, #0x100000000			//set up counter increment
	movi	v31.16b, #0x0
	mov	v31.d[1], x15
	sub	x5, x5, #1		//byte_len - 1

	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	add	x5, x5, x0

	rev32	v30.16b, v0.16b				//set up reversed counter

	add	v30.4s, v30.4s, v31.4s		//CTR block 0

	rev32	v1.16b, v30.16b				//CTR block 1
	add	v30.4s, v30.4s, v31.4s		//CTR block 1

	rev32	v2.16b, v30.16b				//CTR block 2
	add	v30.4s, v30.4s, v31.4s		//CTR block 2

	rev32	v3.16b, v30.16b				//CTR block 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 3

	rev32	v4.16b, v30.16b				//CTR block 4
	add	v30.4s, v30.4s, v31.4s		//CTR block 4

	rev32	v5.16b, v30.16b				//CTR block 5
	add	v30.4s, v30.4s, v31.4s		//CTR block 5
	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1

	rev32	v6.16b, v30.16b				//CTR block 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 6

	rev32	v7.16b, v30.16b				//CTR block 7

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 0

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 0

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 1

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 1

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 1

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 1

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 1

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 2

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 2

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
	ldp	q27, q28, [x8, #64]				//load rk4, rk5

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 3

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 3

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 3

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 4

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 4

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 4

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 5

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 5

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 6

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 6

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 7

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 7

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 7

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 7

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 8

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 8

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 8

	ld1	{ v19.16b}, [x3]
	ext	v19.16b, v19.16b, v19.16b, #8
	rev64	v19.16b, v19.16b
	ldp	q27, q28, [x8, #160]				//load rk10, rk11

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 9

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 9

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 9

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 9

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 10

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 10

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 11
	ldp	q26, q27, [x8, #192]				//load rk12, rk13
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 11

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 11
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 11
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 11

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 11
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 11
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 11

	add	v30.4s, v30.4s, v31.4s		//CTR block 7
	ldr	q28, [x8, #224]					//load rk14

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 12
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 12
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 12

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 12
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 12
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 12

	aese	v2.16b, v27.16b						//AES block 2 - round 13
	aese	v1.16b, v27.16b						//AES block 1 - round 13
	aese	v4.16b, v27.16b						//AES block 4 - round 13

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 12
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 12

	aese	v0.16b, v27.16b						//AES block 0 - round 13
	aese	v5.16b, v27.16b						//AES block 5 - round 13

	aese	v6.16b, v27.16b						//AES block 6 - round 13
	aese	v7.16b, v27.16b						//AES block 7 - round 13
	aese	v3.16b, v27.16b						//AES block 3 - round 13

	add	x4, x0, x1, lsr #3		//end_input_ptr
	cmp	x0, x5				//check if we have <= 8 blocks
	b.ge	.L256_enc_tail						//handle tail

	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext

	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext

.inst	0xce007108	//eor3 v8.16b, v8.16b, v0.16b, v28.16b				//AES block 0 - result
	rev32	v0.16b, v30.16b				//CTR block 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8

.inst	0xce017129	//eor3 v9.16b, v9.16b, v1.16b, v28.16b				//AES block 1 - result
.inst	0xce03716b	//eor3 v11.16b, v11.16b, v3.16b, v28.16b				//AES block 3 - result

	rev32	v1.16b, v30.16b				//CTR block 9
	add	v30.4s, v30.4s, v31.4s		//CTR block 9
	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext

	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
.inst	0xce02714a	//eor3 v10.16b, v10.16b, v2.16b, v28.16b				//AES block 2 - result
	cmp	x0, x5				//check if we have <= 8 blocks

	rev32	v2.16b, v30.16b				//CTR block 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 10
	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result

	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result

	rev32	v3.16b, v30.16b				//CTR block 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 11

.inst	0xce04718c	//eor3 v12.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result

.inst	0xce0771ef	//eor3 v15.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
.inst	0xce0671ce	//eor3 v14.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
.inst	0xce0571ad	//eor3 v13.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result

	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
	rev32	v4.16b, v30.16b				//CTR block 12

	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
	add	v30.4s, v30.4s, v31.4s		//CTR block 12
	b.ge	.L256_enc_prepretail					//do prepretail

.L256_enc_main_loop:	//main	loop start
	// Steady-state loop. Each pass reads 128 bytes of input (four post-indexed
	// ldp q loads through x0) and writes 128 bytes of output (four post-indexed
	// stp q stores through x2). Encryption of blocks 8k+8..8k+15 is interleaved
	// with the GHASH of the eight output blocks produced by the previous pass.
	//
	// Register state at the top of every pass:
	//   v0-v4    counter blocks 8k+8..8k+12, already produced by rev32 of v30
	//   v8-v15   the eight output blocks stored last (hashed during this pass)
	//   v19      GHASH value carried between passes; xored into block 8k (PRE 0/1)
	//   v30,v31  running counter (rev32 form) and the per-block increment
	//   x8       base of the round keys, rk0..rk14 at byte offsets 0..224
	//   x3       base for the hash-key loads, byte offsets 32..208 are read here
	//   x10      address of the 64-bit constant used by the MODULO steps
	//   x5       bound that x0 is compared against for loop control
	//
	// rk0..rk13 are applied with aese (aesmc follows rounds 0..12 only); rk14 is
	// applied by the eor3 that also xors in the plaintext.
	// The eor3 instructions are emitted as .inst words; the trailing comment on
	// each of those lines gives the mnemonic form.
	ldp	q26, q27, [x8, #0]					//load rk0, rk1

	rev32	v5.16b, v30.16b				//CTR block 8k+13
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k

	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	rev64	v9.16b, v9.16b						//GHASH block 8k+1

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
	rev64	v8.16b, v8.16b						//GHASH block 8k

	rev64	v12.16b, v12.16b						//GHASH block 8k+4
	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	rev32	v7.16b, v30.16b				//CTR block 8k+15

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high

	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	rev64	v14.16b, v14.16b						//GHASH block 8k+6
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	rev64	v10.16b, v10.16b						//GHASH block 8k+2

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3

	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	rev64	v13.16b, v13.16b						//GHASH block 8k+5

	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4

	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4

	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5

	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	rev64	v15.16b, v15.16b						//GHASH block 8k+7

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5

	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5

	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6

	ldp	q28, q26, [x8, #128]				//load rk8, rk9
	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid

	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low

	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7

	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8

	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8

.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9

	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9

	ldp	q27, q28, [x8, #160]				//load rk10, rk11
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	ldr	d16, [x10]			//MODULO - load modulo constant
	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9

.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high

	ldp	q26, q27, [x8, #192]				//load rk12, rk13
	// v0-v3 are still mid-encryption, so the next four counter blocks are
	// staged in v20, v22, v23, v25 and copied into v0-v3 at the end of the pass.
	rev32	v20.16b, v30.16b					//CTR block 8k+16 (staged for v0)

	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11

	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
	rev32	v22.16b, v30.16b					//CTR block 8k+17 (staged for v1)

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
	ldr	q28, [x8, #224]					//load rk14
	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13

	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext

	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13

	rev32	v23.16b, v30.16b					//CTR block 8k+18 (staged for v2)
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
	// None of the instructions between this cmp and the closing b.lt write the
	// condition flags, so the branch still tests this comparison. Note that
	// b.lt is the signed less-than condition.
	cmp	x0, x5				//LOOP CONTROL

.inst	0xce02714a	//eor3 v10.16b, v10.16b, v2.16b, v28.16b				//AES block 8k+10 - result
	rev32	v25.16b, v30.16b					//CTR block 8k+19 (staged for v3)
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19

	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
.inst	0xce0571ad	//eor3 v13.16b, v13.16b, v5.16b, v28.16b				//AES block 8k+13 - result

	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13

.inst	0xce04718c	//eor3 v12.16b, v12.16b, v4.16b, v28.16b				//AES block 8k+12 - result
	rev32	v4.16b, v30.16b				//CTR block 8k+20
.inst	0xce03716b	//eor3 v11.16b, v11.16b, v3.16b, v28.16b				//AES block 8k+11 - result

	mov	v3.16b, v25.16b					//CTR block 8k+19
.inst	0xce017129	//eor3 v9.16b, v9.16b, v1.16b, v28.16b				//AES block 8k+9 - result
.inst	0xce007108	//eor3 v8.16b, v8.16b, v0.16b, v28.16b				//AES block 8k+8 - result

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
	mov	v2.16b, v23.16b					//CTR block 8k+18

.inst	0xce0771ef	//eor3 v15.16b, v15.16b, v7.16b, v28.16b				//AES block 8k+15 - result
.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result

.inst	0xce0671ce	//eor3 v14.16b, v14.16b, v6.16b, v28.16b				//AES block 8k+14 - result
	mov	v1.16b, v22.16b					//CTR block 8k+17
	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result

	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result
	mov	v0.16b, v20.16b					//CTR block 8k+16
	b.lt	.L256_enc_main_loop

.L256_enc_prepretail:	//PREPRETAIL
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	ldp	q26, q27, [x8, #0]					//load rk0, rk1
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	rev64	v10.16b, v10.16b						//GHASH block 8k+2

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	rev64	v13.16b, v13.16b						//GHASH block 8k+5
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k

	rev32	v7.16b, v30.16b				//CTR block 8k+15

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0

	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
	rev64	v8.16b, v8.16b						//GHASH block 8k
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1

	rev64	v9.16b, v9.16b						//GHASH block 8k+1
	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1

	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1

	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	eor	v8.16b, v8.16b, v19.16b					//PRE 1

	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	rev64	v14.16b, v14.16b						//GHASH block 8k+6
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high

	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3

	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4

	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid

	rev64	v12.16b, v12.16b						//GHASH block 8k+4
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	rev64	v15.16b, v15.16b						//GHASH block 8k+7
	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6

	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6

	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8

	ldp	q28, q26, [x8, #128]				//load rk8, rk9
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7

	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7

	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8

	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid

	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high

	ldp	q27, q28, [x8, #160]				//load rk10, rk11
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
	ldr	d16, [x10]			//MODULO - load modulo constant

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10

	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11

	ldp	q26, q27, [x8, #192]				//load rk12, rk13
	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
	ldr	q28, [x8, #224]					//load rk14

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13

.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13

	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13

	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
.L256_enc_tail:	//TAIL
	//Tail dispatcher. Computes x5 = bytes still to process, loads the hash
	//key powers used by the first handlers, produces the first ciphertext
	//block in v9 and enters the handler that matches the amount of data left.
	//Each handler below XORs its block with a fixed keystream register
	//(v1 in final-7 ... v7 in final-1), so when fewer blocks remain the
	//pending keystream is moved into the registers the entered handler reads.

	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
	ext	v25.16b, v25.16b, v25.16b, #8		//swap the two 64-bit halves of h8
	sub	x5, x4, x0		//main_end_input_ptr is number of bytes left to process

	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext

	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
	ext	v20.16b, v20.16b, v20.16b, #8		//swap the two 64-bit halves of h5

	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
	ext	v22.16b, v22.16b, v22.16b, #8		//swap the two 64-bit halves of h6
	ext	v23.16b, v23.16b, v23.16b, #8		//swap the two 64-bit halves of h7
	mov	v29.16b, v28.16b				//keep the final XOR key in v29: the handlers reuse v28 as a GHASH temporary

	cmp	x5, #112					//112 bytes = 7 blocks
.inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b				//AES block 8k+8 - result
	b.gt	.L256_enc_blocks_more_than_7

	//7 blocks or fewer. The final-7 handler is skipped, and it is the one
	//that initialises the GHASH accumulators, so clear them here
	//(v19 low, v17 high, v18 mid; the running tag is already in v16).
	//The vector mov/movi/sub instructions do not write the flags, which is
	//why each cmp below may sit several instructions ahead of its b.gt.
	movi	v19.8b, #0					//low accumulator = 0
	mov	v7.16b, v6.16b
	movi	v17.8b, #0					//high accumulator = 0

	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b
	mov	v4.16b, v3.16b

	mov	v3.16b, v2.16b
	sub	v30.4s, v30.4s, v31.4s			//one keystream block stays unused: step the counter back
	mov	v2.16b, v1.16b

	movi	v18.8b, #0					//mid accumulator = 0
	cmp	x5, #96						//96 bytes = 6 blocks
	b.gt	.L256_enc_blocks_more_than_6

	//6 blocks or fewer: shift once more, next keystream (v1) goes to v3
	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b
	cmp	x5, #80						//80 bytes = 5 blocks

	mov	v5.16b, v4.16b
	mov	v4.16b, v3.16b
	mov	v3.16b, v1.16b

	sub	v30.4s, v30.4s, v31.4s			//step the counter back by one more block
	b.gt	.L256_enc_blocks_more_than_5

	//5 blocks or fewer: next keystream (v1) goes to v4
	mov	v7.16b, v6.16b
	sub	v30.4s, v30.4s, v31.4s			//step the counter back by one more block

	mov	v6.16b, v5.16b
	mov	v5.16b, v4.16b

	cmp	x5, #64						//64 bytes = 4 blocks
	mov	v4.16b, v1.16b
	b.gt	.L256_enc_blocks_more_than_4

	//4 blocks or fewer: next keystream (v1) goes to v5
	cmp	x5, #48						//48 bytes = 3 blocks
	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b

	mov	v5.16b, v1.16b
	sub	v30.4s, v30.4s, v31.4s			//step the counter back by one more block
	b.gt	.L256_enc_blocks_more_than_3

	//3 blocks or fewer: next keystream (v1) goes to v6. The final-2 handler
	//multiplies by the low half of v24, so v24 is reloaded here because the
	//final-3 handler that normally loads it is skipped.
	cmp	x5, #32						//32 bytes = 2 blocks
	mov	v7.16b, v6.16b
	ldr	q24, [x3, #96]				//load h4k | h3k

	mov	v6.16b, v1.16b
	sub	v30.4s, v30.4s, v31.4s			//step the counter back by one more block
	b.gt	.L256_enc_blocks_more_than_2

	//2 blocks or fewer: next keystream (v1) goes to v7
	mov	v7.16b, v1.16b

	sub	v30.4s, v30.4s, v31.4s			//step the counter back by one more block
	cmp	x5, #16						//16 bytes = 1 block
	b.gt	.L256_enc_blocks_more_than_1

	//A single block is left: v9 already holds it. Load the key material that
	//the final-block code reads from v21, since the final-1 handler is skipped.
	sub	v30.4s, v30.4s, v31.4s			//step the counter back by one more block
	ldr	q21, [x3, #48]				//load h2k | h1k
	b	.L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_7:	//blocks	left >  7
	//final-7 block: store the ciphertext in v9, hash it with h8 (v25) and
	//encrypt the next block with keystream v1. The three pmull results are
	//written straight into v17/v18/v19 (no eor), so this handler also
	//initialises the high/mid/low GHASH accumulators for the ones that follow.
	st1	{ v9.16b}, [x2], #16				//AES final-7 block  - store result

	rev64	v8.16b, v9.16b						//GHASH final-7 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ldr	q9, [x0], #16				//AES final-6 block - load plaintext

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid (upper half of v24 moved to lane 0 for pmull)

	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid (upper half XOR lower half of the block)
.inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result

	pmull	v18.1q, v27.1d, v18.1d				//GHASH final-7 block - mid
	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
.L256_enc_blocks_more_than_6:	//blocks	left >  6
	//final-6 block: store the ciphertext in v9, accumulate its GHASH
	//contribution using h7 (v23) and the low half of v24 for the middle term,
	//then encrypt the next block with keystream v2.
	//v26, v27 and v28 are scratch registers here.

	st1	{ v9.16b}, [x2], #16				//AES final-6 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-6 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high

	ldr	q9, [x0], #16				//AES final-5 block - load plaintext

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid (upper half XOR lower half of the block)

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
.inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result

	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
.L256_enc_blocks_more_than_5:	//blocks	left >  5
	//final-5 block: store the ciphertext in v9, accumulate its GHASH
	//contribution using h6 (v22) and the upper half of v21 for the middle
	//term, then encrypt the next block with keystream v3.

	st1	{ v9.16b}, [x2], #16				//AES final-5 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-5 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid

	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid (copy to the upper lane because pmull2 reads upper lanes)

	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
.inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
.L256_enc_blocks_more_than_4:	//blocks	left >  4
	//final-4 block: store the ciphertext in v9, accumulate its GHASH
	//contribution using h5 (v20) and the low half of v21 for the middle term,
	//then encrypt the next block with keystream v4.

	st1	{ v9.16b}, [x2], #16				//AES final-4 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-4 block

	ldr	q9, [x0], #16				//AES final-3 block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high

.inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid (upper half XOR lower half of the block)
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low

	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid

	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
.L256_enc_blocks_more_than_3:	//blocks	left >  3
	//final-3 block: store the ciphertext in v9, accumulate its GHASH
	//contribution using h4 (reloaded into v25) and the upper half of
	//h4k | h3k (reloaded into v24), then encrypt the next block with
	//keystream v5. v24 and v25 held the h8/h7 material until this point.

	st1	{ v9.16b}, [x2], #16				//AES final-3 block - store result

	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8		//swap the two 64-bit halves of h4
	rev64	v8.16b, v9.16b						//GHASH final-3 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
	ldr	q24, [x3, #96]				//load h4k | h3k

	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid (copy to the upper lane because pmull2 reads upper lanes)
	ldr	q9, [x0], #16				//AES final-2 block - load plaintext

	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low

.inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
	movi	v16.8b, #0						//suppress further partial tag feed in

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
.L256_enc_blocks_more_than_2:	//blocks	left >  2
	//final-2 block: store the ciphertext in v9, accumulate its GHASH
	//contribution using h3 (loaded into v23) and the low half of v24, then
	//encrypt the next block with keystream v6.
	//v24 must hold h4k | h3k on entry: the final-3 handler loads it on the
	//fall-through path and the tail dispatcher loads it before branching here.

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8		//swap the two 64-bit halves of h3

	st1	{ v9.16b}, [x2], #16			 	//AES final-2 block - store result

	rev64	v8.16b, v9.16b						//GHASH final-2 block
	ldr	q9, [x0], #16				//AES final-1 block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid

	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
.inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid (upper half XOR lower half of the block)

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
.L256_enc_blocks_more_than_1:	//blocks	left >  1
	//final-1 block: store the ciphertext in v9, accumulate its GHASH
	//contribution using h2 (loaded into v22) and the upper half of
	//h2k | h1k (loaded into v21), then encrypt the final block with
	//keystream v7. The final block is not stored here: it may be partial and
	//is masked by the code that follows.

	st1	{ v9.16b}, [x2], #16				//AES final-1 block - store result

	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8		//swap the two 64-bit halves of h2
	rev64	v8.16b, v9.16b						//GHASH final-1 block
	ldr	q9, [x0], #16				//AES final block - load plaintext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	movi	v16.8b, #0						//suppress further partial tag feed in

	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high

.inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high

	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid

	ldr	q21, [x3, #48]				//load h2k | h1k

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid (copy to the upper lane because pmull2 reads upper lanes)

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
.L256_enc_blocks_less_than_1:	//blocks	left <= 1
	//Final (possibly partial) block, GHASH reduction and function epilogue.
	//  1. build a byte mask in v0 from the bit length in x1
	//  2. mask the last ciphertext block, merge it with the bytes already
	//     present at [x2] and store all 16 bytes
	//  3. store the updated counter through x16
	//  4. hash the masked block with h1, reduce v17:v18:v19 and store the
	//     result at [x3]
	//  5. return x9 in x0 after restoring d8..d15 and releasing the frame
	//v21 must hold h2k | h1k on entry (loaded by the final-1 handler or by the
	//tail dispatcher before branching here).

	and	x1, x1, #127				//bit_length %= 128

	sub	x1, x1, #128				//bit_length -= 128

	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])

	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
	and	x1, x1, #127				//bit_length %= 128

	//x1 is now the number of unused bits in the last block (0..127).
	//lsr by register only uses the shift amount modulo 64, so x6 is the
	//mask for whichever 64-bit half is partially used; the csel pair below
	//decides which half that is.
	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
	cmp	x1, #64
	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff

	csel	x14, x6, xzr, lt				//upper half: partial mask if fewer than 64 unused bits, else empty
	csel	x13, x7, x6, lt				//lower half: all ones if fewer than 64 unused bits, else partial mask

	mov	v0.d[0], x13					//ctr0b is mask for last block
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8		//swap the two 64-bit halves of h1

	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
	mov	v0.d[1], x14

	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits

	rev64	v8.16b, v9.16b						//GHASH final block (taken before bif so the hash sees the zero padded block)

	rev32	v30.16b, v30.16b				//counter back to its in-memory byte order
	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
	str	q30, [x16]					//store the updated counter

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	st1	{ v9.16b}, [x2]				//store all 16B

	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low

	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low

	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid

	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid

	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
	ldr	d16, [x10]			//MODULO - load modulo constant

	//Reduction: fold the high part (v17) into the middle (v18), then the
	//middle into the low part (v19), multiplying by the constant in v16
	//at each step.
	ext	v21.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment

.inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
	ext	v19.16b, v19.16b, v19.16b, #8			//swap the 64-bit halves of the result
	rev64	v19.16b, v19.16b				//reverse the bytes within each half
	st1	{ v19.16b }, [x3]				//store the updated hash value at offset 0 of x3
	mov	x0, x9					//return sizes

	ldp	d10, d11, [sp, #16]				//restore the callee-saved low halves of v8..v15
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d8, d9, [sp], #80				//restore d8/d9 and release the 80 byte frame
	ret

.L256_enc_ret:
	//Return 0 with no frame teardown: nothing is restored or popped here.
	//NOTE(review): presumably the early exit for a zero length request; the
	//branch to this label lies outside this part of the file - confirm.
	mov	w0, #0x0					//writing w0 also clears the upper half of x0
	ret
.size	unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
.globl	unroll8_eor3_aes_gcm_dec_256_kernel
.type	unroll8_eor3_aes_gcm_dec_256_kernel,%function
.align	4
//unroll8_eor3_aes_gcm_dec_256_kernel - entry code and first 8 blocks.
//
//Register use, as established by the code up to the main loop label:
//  x0   input pointer, advanced as ciphertext is loaded
//  x1   length in bits; zero branches straight to .L256_dec_ret
//  x2   output pointer, advanced as results are stored
//  x3   hash context: current hash value at offset 0, hash key table above it
//  x4   counter block pointer on entry (kept in x16), then end of input
//  x5   round key pointer on entry (kept in x8), then end of main loop input
//  x9   length in bytes (x1 >> 3)
//  x10  address of the stack slot holding 0xc200000000000000
//  v0..v7   keystream blocks 0..7, v8..v15 ciphertext blocks 0..7
//  v19  running hash value, v26..v28 rotating round keys
//  v30  counter with each 32-bit word byte reversed, v31 counter increment
//Stack frame: 80 bytes; d8..d15 at offsets 0..63, the constant and a zero
//doubleword at offsets 64..79.
//All 14 AES rounds for the first 8 counter blocks are computed before any
//ciphertext is loaded; the XOR with rk14 is folded into the eor3 that also
//applies the ciphertext.
unroll8_eor3_aes_gcm_dec_256_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz	x1, .L256_dec_ret				//nothing to do for a zero length
	stp	d8, d9, [sp, #-80]!				//allocate the frame and save d8/d9
	lsr	x9, x1, #3					//x9 = length in bytes
	mov	x16, x4						//x16 = counter block pointer
	mov	x8, x5						//x8 = round key pointer
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	mov	x5, #0xc200000000000000
	stp	x5, xzr, [sp, #64]				//constant and zero at the top of the frame
	add	x10, sp, #64					//x10 = address of the constant

	ld1	{ v0.16b}, [x16]					//CTR block 0

	mov	x15, #0x100000000			//set up counter increment
	movi	v31.16b, #0x0
	mov	v31.d[1], x15					//v31 = 1 in the last 32-bit lane, 0 elsewhere
	mov	x5, x9

	sub	x5, x5, #1		//byte_len - 1

	rev32	v30.16b, v0.16b				//set up reversed counter

	add	v30.4s, v30.4s, v31.4s		//CTR block 0

	rev32	v1.16b, v30.16b				//CTR block 1
	add	v30.4s, v30.4s, v31.4s		//CTR block 1

	rev32	v2.16b, v30.16b				//CTR block 2
	add	v30.4s, v30.4s, v31.4s		//CTR block 2
	ldp	q26, q27, [x8, #0]				  	//load rk0, rk1

	rev32	v3.16b, v30.16b				//CTR block 3
	add	v30.4s, v30.4s, v31.4s		//CTR block 3

	rev32	v4.16b, v30.16b				//CTR block 4
	add	v30.4s, v30.4s, v31.4s		//CTR block 4

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 0

	rev32	v5.16b, v30.16b				//CTR block 5
	add	v30.4s, v30.4s, v31.4s		//CTR block 5

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 0

	rev32	v6.16b, v30.16b				//CTR block 6
	add	v30.4s, v30.4s, v31.4s		//CTR block 6

	rev32	v7.16b, v30.16b				//CTR block 7 (the matching counter increment is issued later, during round 11)
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 0

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b		        //AES block 6 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 0

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b		        //AES block 7 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b		        //AES block 6 - round 1
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b		        //AES block 4 - round 1
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b		        //AES block 0 - round 1

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 1

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 1

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 2

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 2

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
	ldp	q27, q28, [x8, #64]				//load rk4, rk5

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 3

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 3

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 3

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 3

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 4

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 4

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 4

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 5

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 5

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 5

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 5

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 5

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 6

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 6

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 7

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 7

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 7

	and	x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 8

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 8

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 8

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 9

	ld1	{ v19.16b}, [x3]				//load the current hash value
	ext	v19.16b, v19.16b, v19.16b, #8			//swap its two 64-bit halves
	rev64	v19.16b, v19.16b				//reverse the bytes within each half
	ldp	q27, q28, [x8, #160]				//load rk10, rk11
	add	x4, x0, x1, lsr #3 //end_input_ptr
	add	x5, x5, x0					//x5 = input address at which the main loop stops

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 9

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 9

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 9

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 9

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 10

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 10

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
	ldp	q26, q27, [x8, #192]				//load rk12, rk13

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 11
	add	v30.4s, v30.4s, v31.4s //CTR block 7

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 11
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 11
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 11

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 11
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 11
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 11

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 11
	ldr	q28, [x8, #224]					//load rk14

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 1 - round 12
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 4 - round 12
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 5 - round 12

	//The flags set by this cmp are consumed by the b.ge after round 13;
	//the aese/aesmc instructions in between do not write the flags.
	cmp	x0, x5				//check if we have <= 8 blocks
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 3 - round 12
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 2 - round 12

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 6 - round 12
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 0 - round 12
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 7 - round 12

	//Round 13 has no aesmc; the remaining XOR with rk14 (v28) is done by
	//the eor3 instructions that combine keystream and ciphertext.
	aese	v5.16b, v27.16b						//AES block 5 - round 13
	aese	v1.16b, v27.16b						//AES block 1 - round 13
	aese	v2.16b, v27.16b						//AES block 2 - round 13

	aese	v0.16b, v27.16b						//AES block 0 - round 13
	aese	v4.16b, v27.16b						//AES block 4 - round 13
	aese	v6.16b, v27.16b						//AES block 6 - round 13

	aese	v3.16b, v27.16b						//AES block 3 - round 13
	aese	v7.16b, v27.16b						//AES block 7 - round 13
	b.ge	.L256_dec_tail						//handle tail

	//More than 8 blocks: decrypt the first 8 here. The ciphertext stays in
	//v8..v15 (the main loop hashes it), while v0..v4 are refilled with
	//counter blocks 8..12 as soon as their results have been stored.
	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext

	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext

	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext

	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
	//The flags set by this cmp are consumed by the b.ge at the end of this
	//section; the vector operations and stores in between do not write them.
	cmp	x0, x5				//check if we have <= 8 blocks

.inst	0xce017121	//eor3 v1.16b, v9.16b, v1.16b, v28.16b				//AES block 1 - result
.inst	0xce007100	//eor3 v0.16b, v8.16b, v0.16b, v28.16b				//AES block 0 - result
	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result

	rev32	v0.16b, v30.16b				//CTR block 8
	add	v30.4s, v30.4s, v31.4s		//CTR block 8
.inst	0xce037163	//eor3 v3.16b, v11.16b, v3.16b, v28.16b				//AES block 3 - result

.inst	0xce0571a5	//eor3 v5.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result

.inst	0xce047184	//eor3 v4.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
	rev32	v1.16b, v30.16b				//CTR block 9
	add	v30.4s, v30.4s, v31.4s		//CTR block 9

.inst	0xce027142	//eor3 v2.16b, v10.16b, v2.16b, v28.16b				//AES block 2 - result
	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result

	rev32	v2.16b, v30.16b				//CTR block 10
	add	v30.4s, v30.4s, v31.4s		//CTR block 10

.inst	0xce0671c6	//eor3 v6.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result

	rev32	v3.16b, v30.16b				//CTR block 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 11
	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result

.inst	0xce0771e7	//eor3 v7.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result

	rev32	v4.16b, v30.16b				//CTR block 12
	add	v30.4s, v30.4s, v31.4s		//CTR block 12
	b.ge	.L256_dec_prepretail					//do prepretail

.L256_dec_main_loop:	//main	loop start
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	ldp	q26, q27, [x8, #0]					//load rk0, rk1
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	rev64	v9.16b, v9.16b						//GHASH block 8k+1
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
	rev64	v8.16b, v8.16b						//GHASH block 8k

	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
	rev64	v12.16b, v12.16b						//GHASH block 8k+4
	rev64	v11.16b, v11.16b						//GHASH block 8k+3

	rev32	v7.16b, v30.16b				//CTR block 8k+15
	rev64	v15.16b, v15.16b						//GHASH block 8k+7

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0

	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	ldp	q28, q26, [x8, #32]				//load rk2, rk3

	eor	v8.16b, v8.16b, v19.16b					//PRE 1
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1

	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid

	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high

	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4

	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low

	ldp	q26, q27, [x8, #96]				//load rk6, rk7
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5

.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	rev64	v13.16b, v13.16b						//GHASH block 8k+5

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6

	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v14.16b, v14.16b						//GHASH block 8k+6
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
	ldp	q28, q26, [x8, #128]				//load rk8, rk9

	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7

	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7

	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8

	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high

	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8

	ldp	q27, q28, [x8, #160]				//load rk10, rk11
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9

	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9

	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9

	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high

	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10

	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
	rev32	v20.16b, v30.16b					//CTR block 8k+16
	ldr	d16, [x10]			//MODULO - load modulo constant

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
	ldp	q26, q27, [x8, #192]				//load rk12, rk13

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11

.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
	rev32	v22.16b, v30.16b					//CTR block 8k+17
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11

	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
	ext	v21.16b, v17.16b, v17.16b, #8				 //MODULO - other top alignment

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12

	rev32	v23.16b, v30.16b					//CTR block 8k+18
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11

	ldr	q28, [x8, #224]					//load rk14
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12

	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13

	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13

	rev32	v25.16b, v30.16b					//CTR block 8k+19
.inst	0xce027142	//eor3 v2.16b, v10.16b, v2.16b, v28.16b				//AES block 8k+10 - result
.inst	0xce017121	//eor3 v1.16b, v9.16b, v1.16b, v28.16b				//AES block 8k+9 - result

	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13

	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13

.inst	0xce0571a5	//eor3 v5.16b, v13.16b, v5.16b, v28.16b				//AES block 8k+13 - result
.inst	0xce007100	//eor3 v0.16b, v8.16b, v0.16b, v28.16b				//AES block 8k+8 - result
	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13

	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
	mov	v0.16b, v20.16b					//CTR block 8k+16
.inst	0xce047184	//eor3 v4.16b, v12.16b, v4.16b, v28.16b				//AES block 8k+12 - result

.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
.inst	0xce037163	//eor3 v3.16b, v11.16b, v3.16b, v28.16b				//AES block 8k+11 - result
	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result

	mov	v3.16b, v25.16b					//CTR block 8k+19
	mov	v2.16b, v23.16b					//CTR block 8k+18
	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13

	mov	v1.16b, v22.16b					//CTR block 8k+17
	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
.inst	0xce0771e7	//eor3 v7.16b, v15.16b, v7.16b, v28.16b				//AES block 8k+15 - result

.inst	0xce0671c6	//eor3 v6.16b, v14.16b, v6.16b, v28.16b				//AES block 8k+14 - result
	rev32	v4.16b, v30.16b				//CTR block 8k+20
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20

	cmp	x0, x5				//.LOOP CONTROL
	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
	b.lt	.L256_dec_main_loop

// PREPRETAIL: one more 8-block AES + GHASH pass, reached by falling out of the
// main loop (other entry points, if any, are outside this view).
//   AES   - v0-v4 are used as AES state without being initialised here, so they
//           must already hold counter blocks on entry; v5-v7 are built here from
//           the running counter v30 (rev32 + add of the increment vector v31).
//           Rounds 0-13 are applied with round keys streamed from [x8] through
//           v26/v27/v28. Round 13 is aese only, and rk14 (left in v28) is NOT
//           applied in this block.
//   GHASH - v8-v15 are byte-reversed (rev64) and multiplied by the hash-key
//           values loaded from [x3]; on loop fall-through they are the eight
//           ciphertext blocks loaded by the last loop iteration. The incoming
//           hash in v19 is xored into block v8 first (PRE 0 / PRE 1).
//           v17 / v18 / v19 accumulate the high / mid / low partial products.
//   MODULO - the products are reduced with the constant at [x10]; the reduced
//           hash ends up in v19.
// Execution falls through into .L256_dec_tail.
.L256_dec_prepretail:	//PREPRETAIL
	ldp	q26, q27, [x8, #0]					//load rk0, rk1
	rev32	v5.16b, v30.16b				//CTR block 8k+13
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13

	rev64	v12.16b, v12.16b						//GHASH block 8k+4
	ldr	q21, [x3, #144]				//load h6k | h5k
	ldr	q24, [x3, #192]				//load h8k | h7k

	rev32	v6.16b, v30.16b				//CTR block 8k+14
	rev64	v8.16b, v8.16b						//GHASH block 8k
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14

	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
	ldr	q23, [x3, #176]				//load h7l | h7h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #208]				//load h8l | h8h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v9.16b, v9.16b						//GHASH block 8k+1

	rev32	v7.16b, v30.16b				//CTR block 8k+15
	rev64	v10.16b, v10.16b						//GHASH block 8k+2
	ldr	q20, [x3, #128]				//load h5l | h5h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #160]				//load h6l | h6h
	ext	v22.16b, v22.16b, v22.16b, #8

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0

	ldp	q28, q26, [x8, #32]				//load rk2, rk3
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
	eor	v8.16b, v8.16b, v19.16b					//PRE 1

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1

	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low

	rev64	v11.16b, v11.16b						//GHASH block 8k+3
	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
	rev64	v14.16b, v14.16b						//GHASH block 8k+6

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3

	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2

	ldp	q27, q28, [x8, #64]				//load rk4, rk5
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid

	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3

.inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid

	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low

	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4

.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8
	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid

	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5

	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
	ldp	q26, q27, [x8, #96]				//load rk6, rk7

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v15.16b, v15.16b						//GHASH block 8k+7
	rev64	v13.16b, v13.16b						//GHASH block 8k+5

.inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid

	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
	ldr	q21, [x3, #48]				//load h2k | h1k
	ldr	q24, [x3, #96]				//load h4k | h3k
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6

	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6

	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low

	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6

	ldp	q28, q26, [x8, #128]				//load rk8, rk9
	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7

	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
.inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid

	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high

	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low

	ldp	q27, q28, [x8, #160]				//load rk10, rk11
.inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
.inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9

.inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
.inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
	ldr	d16, [x10]			//MODULO - load modulo constant

.inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid

	aese	v4.16b, v27.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
	aese	v6.16b, v27.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
	aese	v5.16b, v27.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10

.inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up

	aese	v7.16b, v27.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
	ldp	q26, q27, [x8, #192]				//load rk12, rk13

	ext	v21.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11

	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11

	aese	v7.16b, v28.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
	aese	v6.16b, v28.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
	aese	v4.16b, v28.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11

	aese	v5.16b, v28.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12

.inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid

	// From here on the round-13 steps are aese only (no aesmc).
	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
	aese	v6.16b, v26.16b
	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
	aese	v4.16b, v26.16b
	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
	aese	v7.16b, v26.16b
	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
	ldr	q28, [x8, #224]					//load rk14 (last use of v28 as rk11 was the round-11 aese on v5 above)
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	        	//AES block 8k+9 - round 12

	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
	aese	v5.16b, v26.16b
	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12

	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13

	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
.inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15

	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
// TAIL dispatch: decides how many blocks are left and jumps to the matching
// .L256_dec_blocks_more_than_N entry.
//   x5  = x4 - x0 = bytes still to process (x4 is set up outside this view;
//         presumably the end-of-input pointer - TODO confirm).
//   v16 = current hash (v19) with its two 64-bit halves swapped; it is xored
//         into the first block that gets hashed.
//   v29 = copy of rk14 (v28); v28 itself is reused as GHASH scratch further down.
//   v12 = first output block = ciphertext (v9) ^ v0 ^ rk14.
// The flags set by each cmp are consumed by the next b.gt; the loads and vector
// instructions in between do not write NZCV.
// With fewer than 8 blocks left, v17/v18/v19 are zeroed once, and every
// fall-through level moves the pending keystream blocks one register up (so the
// last of several blocks is always taken from v7) and rewinds the counter v30 by
// one increment (v31) for the keystream block that will not be used.
.L256_dec_tail:	//TAIL

	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
	sub	x5, x4, x0		//main_end_input_ptr is number of bytes left to process
	cmp	x5, #112

	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext

	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
	ext	v25.16b, v25.16b, v25.16b, #8
	mov	v29.16b, v28.16b

	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
	ext	v20.16b, v20.16b, v20.16b, #8

.inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
	ext	v22.16b, v22.16b, v22.16b, #8
	ext	v23.16b, v23.16b, v23.16b, #8
	b.gt	.L256_dec_blocks_more_than_7

	// at most 7 blocks left
	mov	v7.16b, v6.16b
	sub	v30.4s, v30.4s, v31.4s
	mov	v6.16b, v5.16b

	mov	v5.16b, v4.16b
	mov	v4.16b, v3.16b
	movi	v19.8b, #0

	movi	v17.8b, #0
	movi	v18.8b, #0
	mov	v3.16b, v2.16b

	cmp	x5, #96
	mov	v2.16b, v1.16b
	b.gt	.L256_dec_blocks_more_than_6

	// at most 6 blocks left
	mov	v7.16b, v6.16b
	mov	v6.16b, v5.16b

	mov	v5.16b, v4.16b
	cmp	x5, #80
	sub	v30.4s, v30.4s, v31.4s

	mov	v4.16b, v3.16b
	mov	v3.16b, v1.16b
	b.gt	.L256_dec_blocks_more_than_5

	// at most 5 blocks left
	cmp	x5, #64
	mov	v7.16b, v6.16b
	sub	v30.4s, v30.4s, v31.4s

	mov	v6.16b, v5.16b

	mov	v5.16b, v4.16b
	mov	v4.16b, v1.16b
	b.gt	.L256_dec_blocks_more_than_4

	// at most 4 blocks left
	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b
	cmp	x5, #48

	mov	v6.16b, v5.16b
	mov	v5.16b, v1.16b
	b.gt	.L256_dec_blocks_more_than_3

	// at most 3 blocks left; v24 is reloaded because the entries below use h4k | h3k
	ldr	q24, [x3, #96]				//load h4k | h3k
	sub	v30.4s, v30.4s, v31.4s
	mov	v7.16b, v6.16b

	cmp	x5, #32
	mov	v6.16b, v1.16b
	b.gt	.L256_dec_blocks_more_than_2

	// at most 2 blocks left
	sub	v30.4s, v30.4s, v31.4s

	mov	v7.16b, v1.16b
	cmp	x5, #16
	b.gt	.L256_dec_blocks_more_than_1

	// exactly one (possibly partial) block left: its result is already in v12
	sub	v30.4s, v30.4s, v31.4s
	ldr	q21, [x3, #48]				//load h2k | h1k
	b	.L256_dec_blocks_less_than_1
// 8 blocks left. Stores the output block held in v12, hashes the ciphertext block
// held in v9 with the key value in v25 (mid term uses v24.d[1]), and prepares the
// next output block (next ciphertext ^ v1 ^ v29) in v12.
// v17 / v19 / v18 are written directly here rather than accumulated, because this
// is the first block hashed on this path. v16 is cleared after it has been xored
// in, so later blocks do not feed it in again. Falls through to the next entry.
.L256_dec_blocks_more_than_7:	//blocks	left >  7
	rev64	v8.16b, v9.16b						//GHASH final-7 block
	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
	st1	{ v12.16b}, [x2], #16				//AES final-7 block  - store result

	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
.inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result

	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
// At least 7 blocks left. Hashes the ciphertext block in v9 with the key value in
// v23 (mid term uses the low half of v24) and xors the three products into
// v17 (high) / v18 (mid) / v19 (low). Stores the output block in v12, then builds
// the next one from the next ciphertext block ^ v2 ^ v29.
// v28, v26 and v27 are scratch here. Falls through to the next entry.
.L256_dec_blocks_more_than_6:	//blocks	left >  6

	rev64	v8.16b, v9.16b						//GHASH final-6 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
	movi	v16.8b, #0						//suppress further partial tag feed in

	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
	st1	{ v12.16b}, [x2], #16				//AES final-6 block - store result
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low

.inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
// At least 6 blocks left. Hashes the ciphertext block in v9 with the key value in
// v22 and xors the products into v17 / v18 / v19. The mid term uses the HIGH half
// of v21, which is why the folded value is duplicated into v27.d[1] before pmull2.
// Stores the output block in v12, then builds the next one from the next
// ciphertext block ^ v3 ^ v29. Falls through to the next entry.
.L256_dec_blocks_more_than_5:	//blocks	left >  5

	rev64	v8.16b, v9.16b						//GHASH final-5 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid

	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
	st1	{ v12.16b}, [x2], #16			  	//AES final-5 block - store result

	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
.inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
	movi	v16.8b, #0						//suppress further partial tag feed in
// At least 5 blocks left. Hashes the ciphertext block in v9 with the key value in
// v20 (mid term uses the low half of v21) and xors the products into
// v17 / v18 / v19. Stores the output block in v12, then builds the next one from
// the next ciphertext block ^ v4 ^ v29. Falls through to the next entry.
.L256_dec_blocks_more_than_4:	//blocks	left >  4

	rev64	v8.16b, v9.16b						//GHASH final-4 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext

	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high

	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
.inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
// At least 4 blocks left. Reloads v25 from [x3, #112] and v24 from [x3, #96],
// hashes the ciphertext block in v9 with v25 and xors the products into
// v17 / v18 / v19. The mid term uses the HIGH half of v24, hence the duplication
// into v27.d[1] before pmull2. Stores the output block in v12, then builds the
// next one from the next ciphertext block ^ v5 ^ v29. Falls through.
.L256_dec_blocks_more_than_3:	//blocks	left >  3

	ldr	q25, [x3, #112]				//load h4l | h4h
	ext	v25.16b, v25.16b, v25.16b, #8
	rev64	v8.16b, v9.16b						//GHASH final-3 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
	ldr	q24, [x3, #96]				//load h4k | h3k

	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result

.inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid

	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high

	movi	v16.8b, #0						//suppress further partial tag feed in
	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low

	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
// At least 3 blocks left. Loads v23 from [x3, #80], hashes the ciphertext block in
// v9 with it and xors the products into v17 / v18 / v19. The mid term uses the low
// half of v24, which must already hold the value from [x3, #96] on every path
// into this label. Stores the output block in v12, then builds the next one from
// the next ciphertext block ^ v6 ^ v29. Falls through to the next entry.
.L256_dec_blocks_more_than_2:	//blocks	left >  2

	rev64	v8.16b, v9.16b						//GHASH final-2 block

	ldr	q23, [x3, #80]				//load h3l | h3h
	ext	v23.16b, v23.16b, v23.16b, #8
	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid

	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
	st1	{ v12.16b}, [x2], #16			  	//AES final-2 block - store result
.inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
	movi	v16.8b, #0						//suppress further partial tag feed in

	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
// At least 2 blocks left. Loads v22 from [x3, #64] and v21 from [x3, #48], hashes
// the ciphertext block in v9 with v22 and xors the products into v17 / v18 / v19.
// The mid term uses the HIGH half of v21, hence the duplication into v27.d[1]
// before pmull2. Stores the output block in v12, then builds the final output
// block from the last ciphertext block ^ v7 ^ v29. Falls through to the
// final-block handling.
.L256_dec_blocks_more_than_1:	//blocks	left >  1

	rev64	v8.16b, v9.16b						//GHASH final-1 block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
	ldr	q22, [x3, #64]				//load h2l | h2h
	ext	v22.16b, v22.16b, v22.16b, #8

	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
	ldr	q9, [x0], #16				//AES final block - load ciphertext
	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result

	ldr	q21, [x3, #48]				//load h2k | h1k
	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low

	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid

	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low

.inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high

	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid

	movi	v16.8b, #0						//suppress further partial tag feed in
	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high

	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
// Final (possibly partial) block, GHASH reduction and function epilogue.
//   1. Builds a 128-bit mask in v0 from the bit length in x1: x1 becomes
//      (128 - x1 % 128) % 128, the number of bits to drop, and the mask is
//      all-ones shifted right by that amount across the two 64-bit lanes.
//   2. Masks the last ciphertext block (v9) before hashing it, and uses bif to
//      keep the bytes already present at [x2] wherever the mask is zero.
//   3. Writes the byte-swapped counter v30 back to [x16].
//   4. Hashes the last block with the key value from [x3, #32] (mid term uses the
//      low half of v21), reduces v17 / v18 / v19 with the constant at [x10] and
//      stores the result at [x3].
//   5. Returns x9 in x0 (x9 is set outside this view; presumably the byte
//      length - TODO confirm), restores d8-d15 and releases the 80-byte frame.
// NOTE(review): a full 16 bytes are read from and written to [x2] even when the
// last block is partial - confirm callers always provide 16 accessible bytes there.
.L256_dec_blocks_less_than_1:	//blocks	left <= 1

	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
	and	x1, x1, #127				//bit_length %= 128

	sub	x1, x1, #128				//bit_length -= 128
	rev32	v30.16b, v30.16b
	str	q30, [x16]					//store the updated counter

	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])

	and	x1, x1, #127			 	//bit_length %= 128

	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
	cmp	x1, #64
	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff

	// fewer than 64 bits dropped: low lane all ones, high lane = shifted mask;
	// otherwise: low lane = shifted mask, high lane = 0
	csel	x14, x6, xzr, lt
	csel	x13, x7, x6, lt

	mov	v0.d[0], x13					//ctr0b is mask for last block
	mov	v0.d[1], x14

	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
	ldr	q20, [x3, #32]				//load h1l | h1h
	ext	v20.16b, v20.16b, v20.16b, #8
	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing

	rev64	v8.16b, v9.16b						//GHASH final block

	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag

	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high

	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid

	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high

	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid

	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
	ldr	d16, [x10]			//MODULO - load modulo constant
	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low

	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up

	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
	st1	{ v12.16b}, [x2]				//store all 16B

	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up

	eor	v21.16b, v17.16b, v21.16b				//MODULO - fold into mid
	eor	v18.16b, v18.16b, v21.16b				//MODULO - fold into mid

	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low

	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
	eor	v19.16b, v19.16b, v17.16b				//MODULO - fold into low

	eor	v19.16b, v19.16b, v18.16b				//MODULO - fold into low
	// swap the 64-bit halves and byte-reverse each before storing the hash
	ext	v19.16b, v19.16b, v19.16b, #8
	rev64	v19.16b, v19.16b
	st1	{ v19.16b }, [x3]
	mov	x0, x9

	// restore the callee-saved low halves of v8-v15 and pop the frame
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d8, d9, [sp], #80
	ret

// Alternate exit: returns 0 in w0 and touches neither the stack nor d8-d15.
// No branch to this label is visible in this part of the file; presumably it is
// the early-out taken before the prologue has pushed the frame
// (TODO confirm against the function entry).
.L256_dec_ret:
	mov	w0, #0x0
	ret
.size	unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
.byte	65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
.align	2
.align	2
#endif
