| /* | 
 |  * Copyright (C) 2013 ARM Ltd. | 
 |  * Copyright (C) 2013 Linaro. | 
 |  * | 
 |  * This code is based on glibc cortex strings work originally authored by Linaro | 
 |  * and re-licensed under GPLv2 for the Linux kernel. The original code can | 
 |  * be found @ | 
 |  * | 
 |  * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | 
 |  * files/head:/src/aarch64/ | 
 |  * | 
 |  * This program is free software; you can redistribute it and/or modify | 
 |  * it under the terms of the GNU General Public License version 2 as | 
 |  * published by the Free Software Foundation. | 
 |  * | 
 |  * This program is distributed in the hope that it will be useful, | 
 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
 |  * GNU General Public License for more details. | 
 |  * | 
 |  * You should have received a copy of the GNU General Public License | 
 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
 |  */ | 
 |  | 
 | #include <linux/linkage.h> | 
 | #include <asm/assembler.h> | 
 | #include <asm/cache.h> | 
 |  | 
 | /* | 
 |  * Copy a buffer from src to dest (alignment handled by the hardware) | 
 |  * | 
 |  * Parameters: | 
 |  *	x0 - dest | 
 |  *	x1 - src | 
 |  *	x2 - n | 
 |  * Returns: | 
 |  *	x0 - dest | 
 |  */ | 
 | dstin	.req	x0 | 
 | src	.req	x1 | 
 | count	.req	x2 | 
 | tmp1	.req	x3 | 
 | tmp1w	.req	w3 | 
 | tmp2	.req	x4 | 
 | tmp2w	.req	w4 | 
 | tmp3	.req	x5 | 
 | tmp3w	.req	w5 | 
 | dst	.req	x6 | 
 |  | 
 | A_l	.req	x7 | 
 | A_h	.req	x8 | 
 | B_l	.req	x9 | 
 | B_h	.req	x10 | 
 | C_l	.req	x11 | 
 | C_h	.req	x12 | 
 | D_l	.req	x13 | 
 | D_h	.req	x14 | 
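
/*
 * The A..D register pairs above carry four 16-byte blocks of data through
 * the 64-byte copy loops below.
 */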
 |  | 
 | ENTRY(memcpy) | 
 | 	mov	dst, dstin | 
 | 	cmp	count, #16 | 
	/* When the length is less than 16, the accesses need not be aligned. */
 | 	b.lo	.Ltiny15 | 
 |  | 
 | 	neg	tmp2, src | 
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
 | 	b.eq	.LSrcAligned | 
 | 	sub	count, count, tmp2 | 
 | 	/* | 
 | 	* Copy the leading memory data from src to dst in an increasing | 
 | 	* address order.By this way,the risk of overwritting the source | 
 | 	* memory data is eliminated when the distance between src and | 
 | 	* dst is less than 16. The memory accesses here are alignment. | 
 | 	*/ | 
 | 	tbz	tmp2, #0, 1f | 
 | 	ldrb	tmp1w, [src], #1 | 
 | 	strb	tmp1w, [dst], #1 | 
 | 1: | 
 | 	tbz	tmp2, #1, 2f | 
 | 	ldrh	tmp1w, [src], #2 | 
 | 	strh	tmp1w, [dst], #2 | 
 | 2: | 
 | 	tbz	tmp2, #2, 3f | 
 | 	ldr	tmp1w, [src], #4 | 
 | 	str	tmp1w, [dst], #4 | 
 | 3: | 
 | 	tbz	tmp2, #3, .LSrcAligned | 
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
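	/*
	* Rough C sketch of the head alignment above (illustrative only,
	* not built; the assembly uses the widest aligned access for each
	* step rather than a byte loop):
	*
	*	head = -(unsigned long)src & 15;
	*	count -= head;
	*	while (head--)
	*		*dst++ = *src++;
	*/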
 |  | 
 | .LSrcAligned: | 
 | 	cmp	count, #64 | 
 | 	b.ge	.Lcpy_over64 | 
 | 	/* | 
 | 	* Deal with small copies quickly by dropping straight into the | 
 | 	* exit block. | 
 | 	*/ | 
 | .Ltail63: | 
 | 	/* | 
 | 	* Copy up to 48 bytes of data. At this point we only need the | 
 | 	* bottom 6 bits of count to be accurate. | 
 | 	*/ | 
 | 	ands	tmp1, count, #0x30 | 
 | 	b.eq	.Ltiny15 | 
 | 	cmp	tmp1w, #0x20 | 
	b.eq	1f			/* (count & 0x30) == 0x20: copy two 16-byte chunks */
	b.lt	2f			/* == 0x10: copy one chunk */
	ldp	A_l, A_h, [src], #16	/* == 0x30: copy three chunks */
 | 	stp	A_l, A_h, [dst], #16 | 
 | 1: | 
 | 	ldp	A_l, A_h, [src], #16 | 
 | 	stp	A_l, A_h, [dst], #16 | 
 | 2: | 
 | 	ldp	A_l, A_h, [src], #16 | 
 | 	stp	A_l, A_h, [dst], #16 | 
 | .Ltiny15: | 
 | 	/* | 
 | 	* Prefer to break one ldp/stp into several load/store to access | 
 | 	* memory in an increasing address order,rather than to load/store 16 | 
 | 	* bytes from (src-16) to (dst-16) and to backward the src to aligned | 
 | 	* address,which way is used in original cortex memcpy. If keeping | 
 | 	* the original memcpy process here, memmove need to satisfy the | 
 | 	* precondition that src address is at least 16 bytes bigger than dst | 
 | 	* address,otherwise some source data will be overwritten when memove | 
 | 	* call memcpy directly. To make memmove simpler and decouple the | 
 | 	* memcpy's dependency on memmove, withdrew the original process. | 
 | 	*/ | 
 | 	tbz	count, #3, 1f | 
 | 	ldr	tmp1, [src], #8 | 
 | 	str	tmp1, [dst], #8 | 
 | 1: | 
 | 	tbz	count, #2, 2f | 
 | 	ldr	tmp1w, [src], #4 | 
 | 	str	tmp1w, [dst], #4 | 
 | 2: | 
 | 	tbz	count, #1, 3f | 
 | 	ldrh	tmp1w, [src], #2 | 
 | 	strh	tmp1w, [dst], #2 | 
 | 3: | 
 | 	tbz	count, #0, .Lexitfunc | 
 | 	ldrb	tmp1w, [src] | 
 | 	strb	tmp1w, [dst] | 
 |  | 
 | .Lexitfunc: | 
 | 	ret | 
 |  | 
 | .Lcpy_over64: | 
 | 	subs	count, count, #128 | 
 | 	b.ge	.Lcpy_body_large | 
 | 	/* | 
 | 	* Less than 128 bytes to copy, so handle 64 here and then jump | 
 | 	* to the tail. | 
 | 	*/ | 
 | 	ldp	A_l, A_h, [src],#16 | 
 | 	stp	A_l, A_h, [dst],#16 | 
 | 	ldp	B_l, B_h, [src],#16 | 
 | 	ldp	C_l, C_h, [src],#16 | 
 | 	stp	B_l, B_h, [dst],#16 | 
 | 	stp	C_l, C_h, [dst],#16 | 
 | 	ldp	D_l, D_h, [src],#16 | 
 | 	stp	D_l, D_h, [dst],#16 | 
 |  | 
 | 	tst	count, #0x3f | 
 | 	b.ne	.Ltail63 | 
 | 	ret | 
 |  | 
 | 	/* | 
 | 	* Critical loop.  Start at a new cache line boundary.  Assuming | 
 | 	* 64 bytes per line this ensures the entire loop is in one line. | 
 | 	*/ | 
 | 	.p2align	L1_CACHE_SHIFT | 
 | .Lcpy_body_large: | 
	/* Preload the first 64 bytes. */
 | 	ldp	A_l, A_h, [src],#16 | 
 | 	ldp	B_l, B_h, [src],#16 | 
 | 	ldp	C_l, C_h, [src],#16 | 
 | 	ldp	D_l, D_h, [src],#16 | 
 | 1: | 
 | 	/* | 
 | 	* interlace the load of next 64 bytes data block with store of the last | 
 | 	* loaded 64 bytes data. | 
 | 	*/ | 
 | 	stp	A_l, A_h, [dst],#16 | 
 | 	ldp	A_l, A_h, [src],#16 | 
 | 	stp	B_l, B_h, [dst],#16 | 
 | 	ldp	B_l, B_h, [src],#16 | 
 | 	stp	C_l, C_h, [dst],#16 | 
 | 	ldp	C_l, C_h, [src],#16 | 
 | 	stp	D_l, D_h, [dst],#16 | 
 | 	ldp	D_l, D_h, [src],#16 | 
 | 	subs	count, count, #64 | 
 | 	b.ge	1b | 
	stp	A_l, A_h, [dst],#16	/* drain the last 64 bytes loaded */
 | 	stp	B_l, B_h, [dst],#16 | 
 | 	stp	C_l, C_h, [dst],#16 | 
 | 	stp	D_l, D_h, [dst],#16 | 
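	/*
	* Sketch of the pipelining above (illustrative only; count entered
	* the loop already reduced by 128, covering the preloaded block
	* and this final drain):
	*
	*	load A, B, C, D			// 64 bytes ahead
	*	do {
	*		store A..D; load next A..D;
	*	} while ((count -= 64) >= 0);
	*	store A..D			// drain the last block
	*/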
 |  | 
 | 	tst	count, #0x3f | 
 | 	b.ne	.Ltail63 | 
 | 	ret | 
 | ENDPROC(memcpy) |