/*-
 * Copyright (c) 2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: memcpy_neon.S,v 1.3 2025/02/27 08:39:53 andvar Exp $")

	.text
/*
 * memcpy (NEON version): copy r2 bytes from the buffer at r1 to the
 * buffer at r0.
 *
 * Register use throughout this routine:
 *	r0	dst as passed in; never written, so it still holds dst
 *		on return
 *	r1	src pointer, advanced as data is read
 *	r2	number of bytes still to be copied
 *	r3	dst pointer, advanced as data is written
 *	ip	scratch
 */
ENTRY(memcpy)
	teq	r2, #0			/* 0 length? */
	cmpne	r0, r1			/*   if not, does src == dst? */
	RETc(eq)			/*   yes (to either), nothing to copy */

	mov	r3, r0			/* work on a copy; keep r0 unchanged */
#if 0
	/* Disabled: plain byte loop for short copies. */
	cmp	r2, #16			/* fewer than 16 bytes to copy? */
	bhs	.Ldst_aligner		/*   nope, do it the long way */

1:	ldrb	ip, [r1], #1		/* load a byte from src */
	subs	r2, r2, #1		/* any more to transfer? */
	strb	ip, [r3], #1		/* save it to dst */
	bne	1b			/*   yes, do next byte */
	RET				/* return */
#endif

.Ldst_aligner:
	tst	r3, #7			/* is dst pointer dword (8-byte) aligned? */
	beq	.Lsrc_aligner		/*   yes, check src pointer */
	/*
	 * Until the dst pointer is dword aligned, copy from src to dst
	 * byte by byte until it is aligned or we've copied everything.
	 */
	ldrb	ip, [r1], #1		/* load a byte from src */
	strb	ip, [r3], #1		/* save the byte to dst */
	subs	r2, r2, #1		/* end of transfer? */
	bne	.Ldst_aligner		/*   no, try next byte */
	RET				/* yes, done (nothing was pushed yet) */

/*
 * dst (r3) is dword aligned at this point.  Save r4/r5, then decide
 * between the aligned-src path and the misaligned-src path.
 */
.Lsrc_aligner:
	push	{r4-r5}			/* save some registers (popped at .Ldone) */
	/*
	 * r4 = r3 + r2, i.e. one past the end of dst.  Nothing in this file
	 * reads that value before .Lfinish overwrites r4.
	 */
	add	r4, r2, r3
	ands	r5, r1, #7		/* get misalignment of src pointer */
	beq	.Lcongruent_main	/*   aligned, do it the fast way */

	/*
	 * src is not dword aligned.  From here on src is read as aligned
	 * dwords and vtbl selects each output dword out of two adjacent
	 * input dwords, using d0 as the byte selector.
	 */
	vdup.8	d1, r5			/* every byte lane = misalignment */
	rsb	r5, r5, #8		/* r5 = 8 - misalignment: wanted bytes */
					/*   in the first aligned dword */
	bic	r1, r1, #7		/* dword align src pointer (round down) */

	vldr	d0, .Ltbl_value		/* load base byte indices */
	vadd.u8	d0, d0, d1		/* add misalignment to every index */

	vld1.64 {d1}, [r1:64]!		/* load the first aligned dword from src */

	cmp	r2, r5			/* do we already have enough? */
	bhi	.Lincongruent		/*   no, so read more */
					/*   yes, fall into .Lincongruent_finish */

/*
 * Build the last output dword from the d1:d2 pair and store it: as a
 * whole dword when r2 >= 8, otherwise through .Lfinish, which stores
 * only the low r2 bytes.  The selector in d0 is overwritten by the
 * result, so this is a final step.
 */
.Lincongruent_finish:
	vtbl.8	d0, {d1-d2}, d0		/* merge last dwords into d0 */
	cmp	r2, #8			/* room for a full dword? */
#ifdef __ARMEB__
	vrev64.32 d0, d0		/* word swap to LE */
#endif
	blo	.Lfinish		/*   no, write final partial dword */
	vst1.32 {d0}, [r3:64]		/*   yes, write final full dword */
	b	.Ldone			/* and we're done! */

/*
 * Misaligned src.  d1 holds the previously loaded aligned src dword
 * (the "leftover") and d0 the vtbl selector.  Each pass loads the next
 * aligned dword into d2 and stores one dst dword picked out of d1:d2,
 * until dst reaches a 64-byte boundary.
 */
.Lincongruent:
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder: d1 = next 8 dst bytes */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	vmov	d1, d2			/* prepare for next dword */
	tst	r3, #63			/* is dst 64-byte aligned? */
	bne	.Lincongruent		/*   no, load next dword */

	/*
	 * We are now 64-byte aligned so all writes should fill one or more
	 * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
	 * still need to read 4 dwords (3 full dwords and 1 dword for that
	 * last byte).
	 *
	 * NOTE(review): the loop below uses d8-d10, and nothing in this file
	 * saves or restores them.  AAPCS lists d8-d15 as callee-saved --
	 * confirm that every caller of this routine tolerates the clobber.
	 */
	cmp	r2, #32			/* can we write 4 more dwords? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */
	vld1.64 {d2-d5}, [r1:64]!	/* read 4 dwords */
	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lincongruent_4dword	/*   no, write just the 4 we have */

	/*
	 * 64 bytes per pass.  On entry d1 is the leftover dword and d2-d5
	 * are the next 4 src dwords.
	 */
1:	vld1.64 {d7-d10}, [r1:64]!	/* read 4 more dwords */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d6, d5			/* keep d5 as leftover; the load below */
					/*   may overwrite d2-d5 */
	cmp	r2, #96			/* have 8+4 dwords to write? */
	blo	2f			/*   no more data, skip the load */
	vld1.64 {d2-d5}, [r1:64]!	/* more data, load 4 dwords */
2:	vtbl.8	d6, {d6-d7}, d0		/* reorder */
	vtbl.8	d7, {d7-d8}, d0		/* reorder */
	vtbl.8	d8, {d8-d9}, d0		/* reorder */
	vtbl.8	d9, {d9-d10}, d0	/* reorder */
	vst1.64 {d6-d9}, [r3:64]!	/* write 4 dwords */
	subs	r2, r2, #64		/* this pass stored 8 dwords */
	beq	.Ldone			/*   nothing left, we're done! */
	vmov	d1, d10			/* last dword read is the new leftover */
	cmp	r2, #64			/* room for another full pass? */
	bhs	1b			/*   yes, go around again */

	/*
	 * We have leftovers in d1 and new untranslated data in d2-d5.
	 * (d2-d5 were only loaded when at least 32 bytes remained, which
	 * the compare below checks again.)
	 */
.Lincongruent_4dword:
	cmp	r2, #32			/* can we write 4 dwords? */
	blo	.Lincongruent_dword	/*   no, handle dword by dword */

	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vtbl.8	d2, {d2-d3}, d0		/* reorder */
	vtbl.8	d3, {d3-d4}, d0		/* reorder */
	vtbl.8	d4, {d4-d5}, d0		/* reorder */
	vst1.64 {d1-d4}, [r3:64]!	/* write 4 dwords */
	vmov	d1, d5			/* move leftovers */
	subs	r2, r2, #32		/* 4 dwords were stored */
	beq	.Ldone			/*   nothing left, we're done! */

/*
 * Misaligned src, tail handling: d1 holds the leftover dword and r5 the
 * number of wanted bytes a leftover dword provides.  The enabled (#else)
 * variant jumps into an unrolled sequence of four load/reorder/store
 * steps so that exactly r2 / 8 whole dwords are written, then deals with
 * the remaining r2 % 8 bytes.
 */
.Lincongruent_dword:
#if 0
	/* Disabled: straightforward loop form of the same tail handling. */
	cmp	r2, r5			/* enough in leftovers? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lincongruent_finish	/*   no, finish it. */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	subs	r2, r2, #8		/* have we written everything? */
	beq	.Ldone			/*   yes, we're done! */
	b	.Lincongruent_dword	/* and go get it */
#else
	cmp	r2, r5			/* are the bytes we have enough? */
	bls	.Lincongruent_finish	/*   yes, finish it. */
	mov	ip, r2			/* get remaining count */
	bic	ip, ip, #7		/* truncate to a dword */
	rsb	ip, ip, #32		/* subtract from 32 */
	ands	r2, r2, #7		/* count mod 8; Z is tested by the beq below */
	/*
	 * Computed jump.  ip is 32 minus the whole-dword byte count, so
	 * ip * 2 is 16 bytes for every step to skip; one step is the four
	 * instructions vld1/vtbl/vst1/vmov.  This assumes 4-byte ARM-state
	 * instructions, where reading pc yields the address of this add
	 * plus 8, i.e. the first vld1 after the nop.
	 */
	add	pc, pc, ip, lsl #1	/* and jump! */
	nop				/* padding so pc (add + 8) is step 1 */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	vtbl.8	d1, {d1-d2}, d0		/* reorder */
	vst1.64 {d1}, [r3:64]!		/* store a dword */
	vmov	d1, d2			/* prepare for next dword */
	beq	.Ldone			/* count mod 8 was 0: nothing left */
	vld1.64 {d2}, [r1:64]!		/* load a dword */
	b	.Lincongruent_finish	/* write last partial dword */
#endif

/*
 * src and dst are both dword aligned.  Copy one dword at a time until
 * dst reaches a 64-byte boundary, then move data in 64-byte halves with
 * vldm/vstm.  r2 is the number of bytes that still have to be stored.
 */
.Lcongruent_main:
	vld1.32 {d0}, [r1:64]!		/* load next dword */
	cmp	r2, #8			/* is a full dword still wanted? */
	blo	.Lfinish		/*   no, store only the low r2 bytes */
	vst1.32 {d0}, [r3:64]!		/* store dword */
	subs	r2, r2, #8		/* account for the dword just stored */
	beq	.Ldone			/*   nothing left? we're done! */
	tst	r3, #63			/* have we hit a 64-byte boundary? */
	bne	.Lcongruent_main	/*   no, copy next dword */

	cmp	r2, #64			/* can we write 8 dwords? */
	blo	.Lcongruent_loop	/*   no, go dword by dword */
	vldm	r1!, {d0-d7}		/* load next 8 dwords */
	cmp	r2, #128		/* can we write 16 dwords? */
	blo	3f			/*   no, then deal with 8 dwords */

	/*
	 * Each pass stores 128 bytes as two 64-byte halves, interleaving
	 * the stores with the loads for the following half.
	 *
	 * NOTE(review): d8-d15 are used here, and nothing in this file
	 * saves or restores them.  AAPCS lists d8-d15 as callee-saved --
	 * confirm that every caller of this routine tolerates the clobber.
	 */
1:	vldm	r1!, {d8-d15}		/* load next 8 dwords */
	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	cmp	r2, #192		/* can we write 16+8 dwords? */
	blo	2f			/*   no, don't load the next 8 dwords */
	vldm	r1!, {d0-d7}		/*   yes, load next 8 dwords */
2:	vstm	r3!, {d8-d15}		/* store 8 more dwords */
	/*
	 * This must be subs, not sub: the beq below has to test this
	 * result.  With a plain sub it would test the flags left by the
	 * "cmp r2, #192" above and return early, with 64 bytes uncopied,
	 * whenever exactly 192 bytes remained at that compare.
	 */
	subs	r2, r2, #128		/* we just stored 16 (8+8) dwords */
	beq	.Ldone			/*   if 0, we're done! */
	cmp	r2, #128		/* can we write 16 dwords? */
	bhs	1b			/*   yes, do it again */
	cmp	r2, #64			/* have we loaded 8 dwords? */
	blo	.Lcongruent_loop	/*   no, proceed to do it dword */

	/*
	 * We now have 8 dwords we can write in d0-d7.
	 */
3:	vstm	r3!, {d0-d7}		/* store 8 more dwords */
	subs	r2, r2, #64		/* we wrote 8 dwords */
	beq	.Ldone			/*   if 0, we're done! */

/*
 * Aligned src, one dword per pass.  A dword is always loaded before the
 * remaining count is checked, so on the way to .Lfinish d0 already holds
 * the data for the final partial dword.
 */
.Lcongruent_loop:
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	blo	.Lfinish		/*   no, write last partial dword */
.Lcongruent_loop_start:
	vst1.32 {d0}, [r3]!		/* store dword into dst */
	subs	r2, r2, #8		/* subtract it from length */
	beq	.Ldone			/*   if 0, we're done! */
	vld1.32 {d0}, [r1]!		/* load dword from src */
	cmp	r2, #8			/* can we write a full dword? */
	bhs	.Lcongruent_loop_start	/*   yes, so do it */
					/*   no, fall into .Lfinish */

/*
 * Store the final partial dword.  d0 holds the data; bits 2, 1 and 0 of
 * r2 select a 4-byte, a 2-byte and a 1-byte store, in that order.  r4
 * always holds the data for the next store; r3 advances past each one.
 */
.Lfinish:
	vmov	r4, r5, d0		/* get last dword from NEON: r4 = low */
					/*   word of d0, r5 = high word */
	tst	r2, #4			/* is the 4-byte piece needed? */
	strne	r4, [r3], #4		/*   yes, store the 1st word */
	movne	r4, r5			/*   yes, move 2nd word into place */
	tst	r2, #2			/* is the 2-byte piece needed? */
#ifdef __ARMEB__
	movne	r4, r4, ror #16		/*   yes, swap halfwords */
#endif
	strneh	r4, [r3], #2		/*   yes, store the halfword */
#ifdef __ARMEL__
	movne	r4, r4, lsr #16		/*   yes, discard just written bytes */
#endif
	tst	r2, #1			/* do we have a final byte? */
#ifdef __ARMEB__
	movne	r4, r4, lsr #24		/*   yes, move MSB to LSB */
#endif
	strneb	r4, [r3], #1		/*   yes, store it */

/*
 * Common exit for every path that ran the push in .Lsrc_aligner.
 * r0 has not been written, so it still holds the original dst.
 */
.Ldone:
	pop	{r4-r5}			/* restore registers */
	RET

	/*
	 * Base byte indices 0..7 for vtbl.  .Lsrc_aligner loads this with
	 * vldr and adds the src misalignment to every byte to form the
	 * selector.  Kept 8-byte aligned.
	 */
	.p2align 3
.Ltbl_value:
#ifdef __ARMEL__
	.quad	0x0706050403020100
#else
	.quad	0x0001020304050607
#endif
END(memcpy)
