/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src, or the buffers do not overlap, call memcpy;
 * otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
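/*
 * dstin (x0) is never modified so it can be returned unchanged; dst (x6)
 * is the working destination pointer that moves during the copy.
 */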
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

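/* Data registers, used in pairs by the 16-byte ldp/stp copies below. */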
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	.weak memmove
ENTRY(__memmove)
ENTRY(memmove)
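	/*
	 * A forward copy via __memcpy is safe when dest is below src or when
	 * the two buffers do not overlap; only a destructive overlap
	 * (src < dest < src + count) needs the backwards copy below.
	 */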
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

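	/*
	 * Destructive overlap: point src and dst one past the end of each
	 * buffer and copy backwards, from high addresses to low.
	 */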
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15	/* Fewer than 16 bytes: probably unaligned accesses. */

	ands	tmp2, src, #15	/* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy enough bytes to make src 16-byte aligned before the main copy.
	 * The cost of these extra instructions is acceptable, and it means
	 * the accesses that follow use aligned addresses.
	 */
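	/*
	 * Each tbz below tests one bit of tmp2 (src & 15) and copies 1, 2, 4
	 * or 8 bytes backwards with pre-decrement addressing.
	 */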
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
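	/*
	 * Copy the final 0-15 bytes: 8, 4, 2 and then 1 byte, as selected by
	 * the low bits of count.
	 */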
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

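	/* The low six bits of count give the remaining 0-63 bytes, if any. */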
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
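	/*
	 * count was biased down by 128 at .Lcpy_over64: 64 for the block
	 * pre-loaded here, and 64 so each iteration can load the next block
	 * before testing count without reading past the source buffer.
	 */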
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
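	/* Flush the last pre-loaded 64 bytes still held in A_l..D_h. */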
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPIPROC(memmove)
ENDPROC(__memmove)