arch/parisc/lib/memcpy.c - linux/kernel/git/bpf/bpf-next - Git at Google

 /*
  *    Optimized memory copy routines.
  *
  *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
  *
  *    This program is free software; you can redistribute it and/or modify
  *    it under the terms of the GNU General Public License as published by
  *    the Free Software Foundation; either version 2, or (at your option)
  *    any later version.
  *
  *    This program is distributed in the hope that it will be useful,
  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *    GNU General Public License for more details.
  *
  *    You should have received a copy of the GNU General Public License
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  *    Portions derived from the GNU C Library
  *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
  *
  * Several strategies are used to get the best performance for various
  * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using
  * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
  * general registers.  Unaligned copies are handled either by aligning the
  * destination and then using shift-and-write method, or in a few cases by
  * falling back to a byte-at-a-time copy.
  *
  * I chose to implement this in C because it is easier to maintain and debug,
  * and in my experiments it appears that the C code generated by gcc (3.3/3.4
  * at the time of writing) is fairly optimal. Unfortunately some of the
  * semantics of the copy routine (exception handling) is difficult to express
  * in C, so we have to play some tricks to get it to work.
  *
  * All the loads and stores are done via explicit asm() code in order to use
  * the right space registers.
  *
  * Testing with various alignments and buffer sizes shows that this code is
  * often >10x faster than a simple byte-at-a-time copy, even for strangely
  * aligned operands. It is interesting to note that the glibc version
  * of memcpy (written in C) is actually quite fast already. This routine is
  * able to beat it by 30-40% for aligned copies because of the loop unrolling,
  * but in some cases the glibc version is still slightly faster. This lends
  * more credibility that gcc can generate very good code as long as we are
  * careful.
  *
  * TODO:
  * - cache prefetching needs more experimentation to get optimal settings
  * - try not to use the post-increment address modifiers; they create additional
  *   interlocks
  * - replace byte-copy loops with stbys sequences
  */

 #ifdef __KERNEL__
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/compiler.h>
 #include <asm/uaccess.h>
 /* Kernel build: every load names %sr1 and every store names %sr2 in its
  * asm template.  The exported wrappers at the end of this file call
  * mtsp(..., 1) and mtsp(..., 2) before entering pa_memcpy(). */
 #define s_space "%%sr1"
 #define d_space "%%sr2"
 #else
 /* Non-kernel build: both directions use %sr0 and the copy routine is
  * compiled under the name new2_copy. */
 #include "memcpy.h"
 #define s_space "%%sr0"
 #define d_space "%%sr0"
 #define pa_memcpy new2_copy
 #endif

 /* Per-CPU exception record.  The fault handlers in this file read its
  * fault_addr field to compute how much of the copy was not done. */
 DECLARE_PER_CPU(struct exception_data, exception_data);

 /*
  * preserve_branch(label) - emit a branch to @label that is never taken.
  *
  * The handler code in this file is entered only through the asm labels
  * named in the __ex_table entries, so from the compiler's point of view
  * nothing jumps to it.  This macro adds a visible "goto label" guarded by
  * a comparison of a volatile variable with itself so the handler is not
  * discarded as unreachable.
  *
  * dummy is given an initial value so the comparison never reads an
  * indeterminate object.
  */
 #define preserve_branch(label)	do {					\
 	volatile int dummy = 0;						\
 	/* The following branch is never taken, it's just here to  */	\
 	/* prevent gcc from optimizing away our exception code. */ 	\
 	if (unlikely(dummy != dummy))					\
 		goto label;						\
 } while (0)

 /* Space id used for user-side accesses: 0 when get_fs() equals KERNEL_DS,
  * otherwise whatever mfsp(3) returns. */
 #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
 /* Space id used for kernel-side accesses: always 0. */
 #define get_kernel_space() (0)

 /*
  * MERGE(w0, sh_1, w1, sh_2) - combine two source words into one output word.
  *
  * Moves sh_2 into %sar with mtsar, then issues "shrpw w0, w1, %sar" and
  * yields the result as an unsigned int.  The sh_1 argument is accepted
  * for symmetry with the glibc original but is not referenced by the
  * expansion.
  * NOTE(review): %sar is written without being declared as an output or
  * clobber -- confirm this is safe with the compiler in use.
  */
 #define MERGE(w0, sh_1, w1, sh_2)  ({					\
 	unsigned int _r;						\
 	asm volatile (							\
 	"mtsar %3\n"							\
 	"shrpw %1, %2, %%sar, %0\n"					\
 	: "=r"(_r)							\
 	: "r"(w0), "r"(w1), "r"(sh_2)					\
 	);								\
 	_r;								\
 })
 /* Requests shorter than this many bytes skip all alignment handling and
  * go straight to the byte loop in pa_memcpy(). */
 #define THRESHOLD	16

 /* Debug trace: with DEBUG_MEMCPY defined, prints a file:line:function
  * prefix followed by the formatted message via printk(KERN_DEBUG ...);
  * otherwise expands to nothing. */
 #ifdef DEBUG_MEMCPY
 #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
 #else
 #define DPRINTF(fmt, args...)
 #endif

 /* Assembler directive used for each __ex_table entry emitted below:
  * ".word" unless __LP64__ is defined, ".dword" when it is. */
 #ifndef __LP64__
 #define EXC_WORD ".word"
 #else
 #define EXC_WORD ".dword"
 #endif

 /*
  * Emit one load of the form "<_insn>,ma <_sz>(<_s>,<_a>), <_t>" plus an
  * __ex_table entry that pairs the instruction (local label 1) with the
  * fixup label _e.
  *   _tt  constraint string for the destination operand _t
  *   _s   space-register string (s_space / d_space)
  *   _a   address operand; declared "+r", so the update performed by the
  *        ",ma" completer is visible to the C code
  * r8 is listed as clobbered -- presumably for the fault fixup path;
  * confirm against the trap handler.
  */
 #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
 	__asm__ __volatile__ (				\
 	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n" 	\
 	"\t.section __ex_table,\"aw\"\n"		\
 	"\t" EXC_WORD "\t1b\n"				\
 	"\t" EXC_WORD "\t" #_e "\n"			\
 	"\t.previous\n"					\
 	: _tt(_t), "+r"(_a)				\
 	: 						\
 	: "r8")

 /*
  * Store counterpart of def_load_ai_insn: emits
  * "<_insn>,ma <_t>, <_sz>(<_s>,<_a>)" plus an __ex_table entry pointing
  * at fixup label _e.  _a is the only output ("+r"); _t is an input with
  * constraint _tt.  r8 is listed as clobbered.
  */
 #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) 	\
 	__asm__ __volatile__ (				\
 	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n" 	\
 	"\t.section __ex_table,\"aw\"\n"		\
 	"\t" EXC_WORD "\t1b\n"				\
 	"\t" EXC_WORD "\t" #_e "\n"			\
 	"\t.previous\n"					\
 	: "+r"(_a) 					\
 	: _tt(_t)					\
 	: "r8")

 /* Address-modifying accessors built on the two macros above:
  * byte (step 1), word (step 4) and double (step 8, "f" register class).
  * Loads take (space, address, target, fixup); stores take
  * (space, value, address, fixup). */
 #define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
 #define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
 #define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
 #define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
 #define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
 #define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

 /*
  * Emit one load "<_insn> <_o>(<_s>,<_a>), <_t>" with a fixed displacement
  * _o and no address update (_a is an input only), plus an __ex_table
  * entry pointing at fixup label _e.  r8 is listed as clobbered.
  */
 #define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) 	\
 	__asm__ __volatile__ (				\
 	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
 	"\t.section __ex_table,\"aw\"\n"		\
 	"\t" EXC_WORD "\t1b\n"				\
 	"\t" EXC_WORD "\t" #_e "\n"			\
 	"\t.previous\n"					\
 	: _tt(_t) 					\
 	: "r"(_a)					\
 	: "r8")

 /*
  * Emit one store "<_insn> <_t>, <_o>(<_s>,<_a>)" with a fixed displacement
  * _o; both _t and _a are inputs.  An __ex_table entry pointing at fixup
  * label _e is emitted alongside.  r8 is listed as clobbered.
  */
 #define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) 	\
 	__asm__ __volatile__ (				\
 	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" 	\
 	"\t.section __ex_table,\"aw\"\n"		\
 	"\t" EXC_WORD "\t1b\n"				\
 	"\t" EXC_WORD "\t" #_e "\n"			\
 	"\t.previous\n"					\
 	: 						\
 	: _tt(_t), "r"(_a)				\
 	: "r8")

 /* Word load/store at a fixed offset: ldw(space, offset, address, target,
  * fixup) and stw(space, value, offset, address, fixup). */
 #define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
 #define stw(_s,_t,_o,_a,_e) 	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

 /* Prefetch helpers.  With CONFIG_PREFETCH they issue a load whose target
  * is %r0 (ldw through s_space for the source, ldd through d_space for the
  * destination); without it they expand to nothing.  Every call site in
  * this file is currently commented out. */
 #ifdef  CONFIG_PREFETCH
 extern inline void prefetch_src(const void *addr)
 {
 	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
 }

 extern inline void prefetch_dst(const void *addr)
 {
 	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
 }
 #else
 #define prefetch_src(addr)
 #define prefetch_dst(addr)
 #endif

 /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
  * per loop.  This code is derived from glibc.
  *
  * dst:    destination address (stores go through d_space)
  * src:    source address; it is rounded down to a word boundary here and
  *         the discarded low bits become the MERGE shift amounts
  * len:    number of 32-bit words to store
  * o_dst, o_src, o_len: start addresses and BYTE length of the enclosing
  *         pa_memcpy() request; used only to compute the fault return value
  *
  * Returns 0 on success.  If a load (store) faults, returns o_len minus the
  * distance from o_src (o_dst) to the faulting address, i.e. a byte count
  * that never exceeds the original request.
  */
 static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len)
 {
 	/* gcc complains that a2 and a3 may be uninitialized, but actually
 	 * they cannot be.  Initialize a2/a3 to shut gcc up.
 	 */
 	register unsigned int a0, a1, a2 = 0, a3 = 0;
 	int sh_1, sh_2;
 	struct exception_data *d;

 	/* prefetch_src((const void *)src); */

 	/* Calculate how to shift a word read at the memory operation
 	   aligned srcp to make it aligned for copy.  */
 	sh_1 = 8 * (src % sizeof(unsigned int));
 	sh_2 = 8 * sizeof(unsigned int) - sh_1;

 	/* Make src aligned by rounding it down.  */
 	src &= -sizeof(unsigned int);

 	/* Prime two of a0..a3 and bias src/dst/len so that execution can
 	 * enter the 4-way unrolled loop at the right step. */
 	switch (len % 4)
 	{
 		case 2:
 			/* a1 = ((unsigned int *) src)[0];
 			   a2 = ((unsigned int *) src)[1]; */
 			ldw(s_space, 0, src, a1, cda_ldw_exc);
 			ldw(s_space, 4, src, a2, cda_ldw_exc);
 			src -= 1 * sizeof(unsigned int);
 			dst -= 3 * sizeof(unsigned int);
 			len += 2;
 			goto do1;
 		case 3:
 			/* a0 = ((unsigned int *) src)[0];
 			   a1 = ((unsigned int *) src)[1]; */
 			ldw(s_space, 0, src, a0, cda_ldw_exc);
 			ldw(s_space, 4, src, a1, cda_ldw_exc);
 			src -= 0 * sizeof(unsigned int);
 			dst -= 2 * sizeof(unsigned int);
 			len += 1;
 			goto do2;
 		case 0:
 			if (len == 0)
 				return 0;
 			/* a3 = ((unsigned int *) src)[0];
 			   a0 = ((unsigned int *) src)[1]; */
 			ldw(s_space, 0, src, a3, cda_ldw_exc);
 			ldw(s_space, 4, src, a0, cda_ldw_exc);
 			src += 1 * sizeof(unsigned int);
 			dst -= 1 * sizeof(unsigned int);
 			len += 0;
 			goto do3;
 		case 1:
 			/* a2 = ((unsigned int *) src)[0];
 			   a3 = ((unsigned int *) src)[1]; */
 			ldw(s_space, 0, src, a2, cda_ldw_exc);
 			ldw(s_space, 4, src, a3, cda_ldw_exc);
 			src += 2 * sizeof(unsigned int);
 			dst -= 0 * sizeof(unsigned int);
 			len -= 1;
 			if (len == 0)
 				goto do0;
 			goto do4;			/* No-op.  */
 	}

 	do
 	{
 		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
 do4:
 		/* a0 = ((unsigned int *) src)[0]; */
 		ldw(s_space, 0, src, a0, cda_ldw_exc);
 		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
 		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
 do3:
 		/* a1 = ((unsigned int *) src)[1]; */
 		ldw(s_space, 4, src, a1, cda_ldw_exc);
 		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
 		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
 do2:
 		/* a2 = ((unsigned int *) src)[2]; */
 		ldw(s_space, 8, src, a2, cda_ldw_exc);
 		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
 		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
 do1:
 		/* a3 = ((unsigned int *) src)[3]; */
 		ldw(s_space, 12, src, a3, cda_ldw_exc);
 		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
 		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

 		src += 4 * sizeof(unsigned int);
 		dst += 4 * sizeof(unsigned int);
 		len -= 4;
 	}
 	while (len != 0);

 do0:
 	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
 	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

 	/* Keep the two handlers below reachable as far as gcc can tell. */
 	preserve_branch(handle_load_error);
 	preserve_branch(handle_store_error);

 	return 0;

 	/* o_len is already in bytes (pa_memcpy() passes its original len),
 	 * so it must not be scaled by the word size here. */
 handle_load_error:
 	__asm__ __volatile__ ("cda_ldw_exc:\n");
 	d = &__get_cpu_var(exception_data);
 	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
 		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
 	return o_len - d->fault_addr + o_src;

 handle_store_error:
 	__asm__ __volatile__ ("cda_stw_exc:\n");
 	d = &__get_cpu_var(exception_data);
 	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
 		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
 	return o_len - d->fault_addr + o_dst;
 }


 /*
  * pa_memcpy - copy len bytes from srcp (read through s_space) to dstp
  * (written through d_space).
  *
  * Strategy:
  *   - fewer than THRESHOLD bytes: plain byte loop;
  *   - src and dst agree modulo 8: byte-copy up to a double boundary, then
  *     32-byte and 16-byte unrolled word loops, then a byte tail;
  *   - src and dst agree modulo 4 only: byte-copy up to a word boundary
  *     and join the word loops;
  *   - otherwise: byte-copy until dst is word aligned, hand the whole
  *     words to copy_dstaligned(), then finish with the byte tail.
  *
  * Returns 0 for success, otherwise, returns number of bytes not transferred.
  */
 unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 {
 	register unsigned long src, dst, t1, t2, t3;
 	register unsigned char *pcs, *pcd;
 	register unsigned int *pws, *pwd;
 	register double *pds, *pdd;
 	unsigned long ret = 0;
 	unsigned long o_dst, o_src, o_len;
 	struct exception_data *d;

 	src = (unsigned long)srcp;
 	dst = (unsigned long)dstp;
 	pcs = (unsigned char *)srcp;
 	pcd = (unsigned char *)dstp;

 	/* Remember the original request for the fault handlers. */
 	o_dst = dst; o_src = src; o_len = len;

 	/* prefetch_src((const void *)srcp); */

 	if (len < THRESHOLD)
 		goto byte_copy;

 	/* Check alignment */
 	t1 = (src ^ dst);
 	if (unlikely(t1 & (sizeof(double)-1)))
 		goto unaligned_copy;

 	/* src and dst have same alignment. */

 	/* Copy bytes till we are double-aligned. */
 	t2 = src & (sizeof(double) - 1);
 	if (unlikely(t2 != 0)) {
 		t2 = sizeof(double) - t2;
 		while (t2 && len) {
 			/* *pcd++ = *pcs++; */
 			ldbma(s_space, pcs, t3, pmc_load_exc);
 			len--;
 			stbma(d_space, t3, pcd, pmc_store_exc);
 			t2--;
 		}
 	}

 	pds = (double *)pcs;
 	pdd = (double *)pcd;

 #if 0
 	/* Copy 8 doubles at a time */
 	while (len >= 8*sizeof(double)) {
 		register double r1, r2, r3, r4, r5, r6, r7, r8;
 		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
 		flddma(s_space, pds, r1, pmc_load_exc);
 		flddma(s_space, pds, r2, pmc_load_exc);
 		flddma(s_space, pds, r3, pmc_load_exc);
 		flddma(s_space, pds, r4, pmc_load_exc);
 		fstdma(d_space, r1, pdd, pmc_store_exc);
 		fstdma(d_space, r2, pdd, pmc_store_exc);
 		fstdma(d_space, r3, pdd, pmc_store_exc);
 		fstdma(d_space, r4, pdd, pmc_store_exc);

 #if 0
 		if (L1_CACHE_BYTES <= 32)
 			prefetch_src((char *)pds + L1_CACHE_BYTES);
 #endif
 		flddma(s_space, pds, r5, pmc_load_exc);
 		flddma(s_space, pds, r6, pmc_load_exc);
 		flddma(s_space, pds, r7, pmc_load_exc);
 		flddma(s_space, pds, r8, pmc_load_exc);
 		fstdma(d_space, r5, pdd, pmc_store_exc);
 		fstdma(d_space, r6, pdd, pmc_store_exc);
 		fstdma(d_space, r7, pdd, pmc_store_exc);
 		fstdma(d_space, r8, pdd, pmc_store_exc);
 		len -= 8*sizeof(double);
 	}
 #endif

 	pws = (unsigned int *)pds;
 	pwd = (unsigned int *)pdd;

 word_copy:
 	/* 8 words (32 bytes) per iteration. */
 	while (len >= 8*sizeof(unsigned int)) {
 		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
 		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
 		ldwma(s_space, pws, r1, pmc_load_exc);
 		ldwma(s_space, pws, r2, pmc_load_exc);
 		ldwma(s_space, pws, r3, pmc_load_exc);
 		ldwma(s_space, pws, r4, pmc_load_exc);
 		stwma(d_space, r1, pwd, pmc_store_exc);
 		stwma(d_space, r2, pwd, pmc_store_exc);
 		stwma(d_space, r3, pwd, pmc_store_exc);
 		stwma(d_space, r4, pwd, pmc_store_exc);

 		ldwma(s_space, pws, r5, pmc_load_exc);
 		ldwma(s_space, pws, r6, pmc_load_exc);
 		ldwma(s_space, pws, r7, pmc_load_exc);
 		ldwma(s_space, pws, r8, pmc_load_exc);
 		stwma(d_space, r5, pwd, pmc_store_exc);
 		stwma(d_space, r6, pwd, pmc_store_exc);
 		stwma(d_space, r7, pwd, pmc_store_exc);
 		stwma(d_space, r8, pwd, pmc_store_exc);
 		len -= 8*sizeof(unsigned int);
 	}

 	/* 4 words (16 bytes) per iteration. */
 	while (len >= 4*sizeof(unsigned int)) {
 		register unsigned int r1,r2,r3,r4;
 		ldwma(s_space, pws, r1, pmc_load_exc);
 		ldwma(s_space, pws, r2, pmc_load_exc);
 		ldwma(s_space, pws, r3, pmc_load_exc);
 		ldwma(s_space, pws, r4, pmc_load_exc);
 		stwma(d_space, r1, pwd, pmc_store_exc);
 		stwma(d_space, r2, pwd, pmc_store_exc);
 		stwma(d_space, r3, pwd, pmc_store_exc);
 		stwma(d_space, r4, pwd, pmc_store_exc);
 		len -= 4*sizeof(unsigned int);
 	}

 	pcs = (unsigned char *)pws;
 	pcd = (unsigned char *)pwd;

 byte_copy:
 	while (len) {
 		/* *pcd++ = *pcs++; */
 		ldbma(s_space, pcs, t3, pmc_load_exc);
 		stbma(d_space, t3, pcd, pmc_store_exc);
 		len--;
 	}

 	return 0;

 unaligned_copy:
 	/* possibly we are aligned on a word, but not on a double... */
 	/* The whole comparison belongs inside likely(); the previous
 	 * "likely(x) == 0" applied the hint to the wrong expression. */
 	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
 		t2 = src & (sizeof(unsigned int) - 1);

 		if (unlikely(t2 != 0)) {
 			t2 = sizeof(unsigned int) - t2;
 			/* len >= THRESHOLD here, so len cannot run out. */
 			while (t2) {
 				/* *pcd++ = *pcs++; */
 				ldbma(s_space, pcs, t3, pmc_load_exc);
 				stbma(d_space, t3, pcd, pmc_store_exc);
 				len--;
 				t2--;
 			}
 		}

 		pws = (unsigned int *)pcs;
 		pwd = (unsigned int *)pcd;
 		goto word_copy;
 	}

 	/* Align the destination.  */
 	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
 		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
 		while (t2) {
 			/* *pcd++ = *pcs++; */
 			ldbma(s_space, pcs, t3, pmc_load_exc);
 			stbma(d_space, t3, pcd, pmc_store_exc);
 			len--;
 			t2--;
 		}
 		dst = (unsigned long)pcd;
 		src = (unsigned long)pcs;
 	}

 	/* Whole words go through the shift-and-merge copier. */
 	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
 		o_dst, o_src, o_len);
 	if (ret)
 		return ret;

 	/* Step past the words just copied; 0..3 bytes remain. */
 	pcs += (len & -sizeof(unsigned int));
 	pcd += (len & -sizeof(unsigned int));
 	len %= sizeof(unsigned int);

 	/* Keep the two handlers below reachable as far as gcc can tell. */
 	preserve_branch(handle_load_error);
 	preserve_branch(handle_store_error);

 	goto byte_copy;

 handle_load_error:
 	__asm__ __volatile__ ("pmc_load_exc:\n");
 	d = &__get_cpu_var(exception_data);
 	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
 		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
 	return o_len - d->fault_addr + o_src;

 handle_store_error:
 	__asm__ __volatile__ ("pmc_store_exc:\n");
 	d = &__get_cpu_var(exception_data);
 	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
 		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
 	return o_len - d->fault_addr + o_dst;
 }

 #ifdef __KERNEL__
 /* Copy len bytes from kernel memory at src to user memory at dst.
  * Returns the pa_memcpy() result: 0 on success, else bytes not copied. */
 unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
 {
 	unsigned long not_copied;

 	mtsp(get_kernel_space(), 1);	/* loads: kernel space */
 	mtsp(get_user_space(), 2);	/* stores: user space */
 	not_copied = pa_memcpy((void __force *)dst, src, len);

 	return not_copied;
 }

 /* Copy len bytes from user memory at src to kernel memory at dst.
  * Returns the pa_memcpy() result: 0 on success, else bytes not copied. */
 unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
 {
 	unsigned long not_copied;

 	mtsp(get_user_space(), 1);	/* loads: user space */
 	mtsp(get_kernel_space(), 2);	/* stores: kernel space */
 	not_copied = pa_memcpy(dst, (void __force *)src, len);

 	return not_copied;
 }

 /* Copy len bytes between two user buffers.
  * Returns the pa_memcpy() result: 0 on success, else bytes not copied. */
 unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
 {
 	unsigned long not_copied;

 	mtsp(get_user_space(), 1);	/* loads: user space */
 	mtsp(get_user_space(), 2);	/* stores: user space */
 	not_copied = pa_memcpy((void __force *)dst, (void __force *)src, len);

 	return not_copied;
 }


 /* Kernel-to-kernel memcpy: both space registers get the kernel space id,
  * the pa_memcpy() status is discarded and dst is handed back. */
 void * memcpy(void * dst,const void *src, size_t count)
 {
 	void *const result = dst;

 	mtsp(get_kernel_space(), 1);
 	mtsp(get_kernel_space(), 2);
 	(void)pa_memcpy(dst, src, count);

 	return result;
 }

 /* Export the four copy entry points defined above. */
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_in_user);
 EXPORT_SYMBOL(memcpy);
 #endif
	/*
	* Optimized memory copy routines.
	*
	* Copyright (C) 2004 Randolph Chung <tausq@debian.org>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2, or (at your option)
	* any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	*
	* Portions derived from the GNU C Library
	* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
	*
	* Several strategies are used to get the best performance for various
	* conditions. In the optimal case, we copy 64-bytes in an unrolled loop using
	* fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
	* general registers. Unaligned copies are handled either by aligning the
	* destination and then using shift-and-write method, or in a few cases by
	* falling back to a byte-at-a-time copy.
	*
	* I chose to implement this in C because it is easier to maintain and debug,
	* and in my experiments it appears that the C code generated by gcc (3.3/3.4
	* at the time of writing) is fairly optimal. Unfortunately some of the
	* semantics of the copy routine (exception handling) is difficult to express
	* in C, so we have to play some tricks to get it to work.
	*
	* All the loads and stores are done via explicit asm() code in order to use
	* the right space registers.
	*
	* Testing with various alignments and buffer sizes shows that this code is
	* often >10x faster than a simple byte-at-a-time copy, even for strangely
	* aligned operands. It is interesting to note that the glibc version
	* of memcpy (written in C) is actually quite fast already. This routine is
	* able to beat it by 30-40% for aligned copies because of the loop unrolling,
	* but in some cases the glibc version is still slightly faster. This lends
	* more credibility that gcc can generate very good code as long as we are
	* careful.
	*
	* TODO:
	* - cache prefetching needs more experimentation to get optimal settings
	* - try not to use the post-increment address modifiers; they create additional
	* interlocks
	* - replace byte-copy loops with stbys sequences
	*/

	#ifdef __KERNEL__
	#include <linux/config.h>
	#include <linux/module.h>
	#include <linux/compiler.h>
	#include <asm/uaccess.h>
	/* Kernel build: loads name %sr1 and stores name %sr2. */
	#define s_space "%%sr1"
	#define d_space "%%sr2"
	#else
	/* Non-kernel build: both directions use %sr0 and the routine is
	 * compiled under the name new2_copy. */
	#include "memcpy.h"
	#define s_space "%%sr0"
	#define d_space "%%sr0"
	#define pa_memcpy new2_copy
	#endif

	/* Per-CPU exception record; the fault handlers read its fault_addr. */
	DECLARE_PER_CPU(struct exception_data, exception_data);

	/*
	 * preserve_branch(label) - emit a branch to @label that is never taken,
	 * so that handler code reached only through __ex_table asm labels is
	 * not discarded as unreachable.  dummy is initialised so the
	 * self-comparison never reads an indeterminate object.
	 */
	#define preserve_branch(label) do { \
		volatile int dummy = 0; \
		/* The following branch is never taken, it's just here to */ \
		/* prevent gcc from optimizing away our exception code. */ \
		if (unlikely(dummy != dummy)) \
			goto label; \
	} while (0)

	/* User space id: 0 when get_fs() equals KERNEL_DS, else mfsp(3). */
	#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
	/* Kernel space id: always 0. */
	#define get_kernel_space() (0)

	/* MERGE: load sh_2 into %sar, then "shrpw w0, w1, %sar" and yield the
	 * result.  sh_1 is accepted but not referenced by the expansion. */
	#define MERGE(w0, sh_1, w1, sh_2) ({ \
	unsigned int _r; \
	asm volatile ( \
	"mtsar %3\n" \
	"shrpw %1, %2, %%sar, %0\n" \
	: "=r"(_r) \
	: "r"(w0), "r"(w1), "r"(sh_2) \
	); \
	_r; \
	})
	/* Requests shorter than this many bytes use the byte loop only. */
	#define THRESHOLD 16

	/* Debug trace via printk(KERN_DEBUG ...) when DEBUG_MEMCPY is defined;
	 * expands to nothing otherwise. */
	#ifdef DEBUG_MEMCPY
	#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
	#else
	#define DPRINTF(fmt, args...)
	#endif

	#ifndef __LP64__
	#define EXC_WORD ".word"
	#else
	#define EXC_WORD ".dword"
	#endif

	/* Load with the ",ma" completer plus an __ex_table entry pairing the
	 * instruction (label 1) with fixup label _e.  _a is "+r", so its
	 * updated value is visible to C.  r8 is listed as clobbered. */
	#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: _tt(_t), "+r"(_a) \
	: \
	: "r8")

	/* Store with the ",ma" completer plus an __ex_table entry for fixup
	 * label _e.  _a is the only output ("+r"); _t is an input. */
	#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: "+r"(_a) \
	: _tt(_t) \
	: "r8")

	/* Address-modifying byte / word / double accessors (steps 1, 4, 8). */
	#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
	#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
	#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
	#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
	#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
	#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

	/* Load at fixed displacement _o with no address update, plus an
	 * __ex_table entry for fixup label _e. */
	#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: _tt(_t) \
	: "r"(_a) \
	: "r8")

	/* Store at fixed displacement _o (both operands are inputs), plus an
	 * __ex_table entry for fixup label _e. */
	#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" \
	"\t.section __ex_table,\"aw\"\n" \
	"\t" EXC_WORD "\t1b\n" \
	"\t" EXC_WORD "\t" #_e "\n" \
	"\t.previous\n" \
	: \
	: _tt(_t), "r"(_a) \
	: "r8")

	/* Word load/store at a fixed offset. */
	#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
	#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e)

	/* Prefetch helpers: a load targeting %r0 when CONFIG_PREFETCH is set,
	 * nothing otherwise. */
	#ifdef CONFIG_PREFETCH
	extern inline void prefetch_src(const void *addr)
	{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
	}

	extern inline void prefetch_dst(const void *addr)
	{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
	}
	#else
	#define prefetch_src(addr)
	#define prefetch_dst(addr)
	#endif

	/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
	* per loop. This code is derived from glibc.
	*/
	static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len)
	{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	* they cannot be. Initialize a2/a3 to shut gcc up.
	*/
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void )src); /

	/* Calculate how to shift a word read at the memory operation
	aligned srcp to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
	case 2:
	/* a1 = ((unsigned int *) src)[0];
	a2 = ((unsigned int ) src)[1]; /
	ldw(s_space, 0, src, a1, cda_ldw_exc);
	ldw(s_space, 4, src, a2, cda_ldw_exc);
	src -= 1 * sizeof(unsigned int);
	dst -= 3 * sizeof(unsigned int);
	len += 2;
	goto do1;
	case 3:
	/* a0 = ((unsigned int *) src)[0];
	a1 = ((unsigned int ) src)[1]; /
	ldw(s_space, 0, src, a0, cda_ldw_exc);
	ldw(s_space, 4, src, a1, cda_ldw_exc);
	src -= 0 * sizeof(unsigned int);
	dst -= 2 * sizeof(unsigned int);
	len += 1;
	goto do2;
	case 0:
	if (len == 0)
	return 0;
	/* a3 = ((unsigned int *) src)[0];
	a0 = ((unsigned int ) src)[1]; /
	ldw(s_space, 0, src, a3, cda_ldw_exc);
	ldw(s_space, 4, src, a0, cda_ldw_exc);
	src -=-1 * sizeof(unsigned int);
	dst -= 1 * sizeof(unsigned int);
	len += 0;
	goto do3;
	case 1:
	/* a2 = ((unsigned int *) src)[0];
	a3 = ((unsigned int ) src)[1]; /
	ldw(s_space, 0, src, a2, cda_ldw_exc);
	ldw(s_space, 4, src, a3, cda_ldw_exc);
	src -=-2 * sizeof(unsigned int);
	dst -= 0 * sizeof(unsigned int);
	len -= 1;
	if (len == 0)
	goto do0;
	goto do4; /* No-op. */
	}

	do
	{
	/* prefetch_src((const void )(src + 4 sizeof(unsigned int))); */
	do4:
	/* a0 = ((unsigned int ) src)[0]; /
	ldw(s_space, 0, src, a0, cda_ldw_exc);
	/* ((unsigned int ) dst)[0] = MERGE (a2, sh_1, a3, sh_2); /
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
	do3:
	/* a1 = ((unsigned int ) src)[1]; /
	ldw(s_space, 4, src, a1, cda_ldw_exc);
	/* ((unsigned int ) dst)[1] = MERGE (a3, sh_1, a0, sh_2); /
	stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
	do2:
	/* a2 = ((unsigned int ) src)[2]; /
	ldw(s_space, 8, src, a2, cda_ldw_exc);
	/* ((unsigned int ) dst)[2] = MERGE (a0, sh_1, a1, sh_2); /
	stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
	do1:
	/* a3 = ((unsigned int ) src)[3]; /
	ldw(s_space, 12, src, a3, cda_ldw_exc);
	/* ((unsigned int ) dst)[3] = MERGE (a1, sh_1, a2, sh_2); /
	stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

	src += 4 * sizeof(unsigned int);
	dst += 4 * sizeof(unsigned int);
	len -= 4;
	}
	while (len != 0);

	do0:
	/* ((unsigned int ) dst)[0] = MERGE (a2, sh_1, a3, sh_2); /
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

	handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
	o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len * 4 - d->fault_addr + o_src;

	handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
	o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len * 4 - d->fault_addr + o_dst;
	}


	/* Returns 0 for success, otherwise, returns number of bytes not transferred. */
	unsigned long pa_memcpy(void dstp, const void srcp, unsigned long len)
	{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char pcs, pcd;
	register unsigned int pws, pwd;
	register double pds, pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void )srcp); /

	if (len < THRESHOLD)
	goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
	goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
	t2 = sizeof(double) - t2;
	while (t2 && len) {
	/* pcd++ = pcs++; */
	ldbma(s_space, pcs, t3, pmc_load_exc);
	len--;
	stbma(d_space, t3, pcd, pmc_store_exc);
	t2--;
	}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

	#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
	register double r1, r2, r3, r4, r5, r6, r7, r8;
	/* prefetch_src((char )pds + L1_CACHE_BYTES); /
	flddma(s_space, pds, r1, pmc_load_exc);
	flddma(s_space, pds, r2, pmc_load_exc);
	flddma(s_space, pds, r3, pmc_load_exc);
	flddma(s_space, pds, r4, pmc_load_exc);
	fstdma(d_space, r1, pdd, pmc_store_exc);
	fstdma(d_space, r2, pdd, pmc_store_exc);
	fstdma(d_space, r3, pdd, pmc_store_exc);
	fstdma(d_space, r4, pdd, pmc_store_exc);

	#if 0
	if (L1_CACHE_BYTES <= 32)
	prefetch_src((char *)pds + L1_CACHE_BYTES);
	#endif
	flddma(s_space, pds, r5, pmc_load_exc);
	flddma(s_space, pds, r6, pmc_load_exc);
	flddma(s_space, pds, r7, pmc_load_exc);
	flddma(s_space, pds, r8, pmc_load_exc);
	fstdma(d_space, r5, pdd, pmc_store_exc);
	fstdma(d_space, r6, pdd, pmc_store_exc);
	fstdma(d_space, r7, pdd, pmc_store_exc);
	fstdma(d_space, r8, pdd, pmc_store_exc);
	len -= 8*sizeof(double);
	}
	#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

	word_copy:
	while (len >= 8*sizeof(unsigned int)) {
	register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
	/* prefetch_src((char )pws + L1_CACHE_BYTES); /
	ldwma(s_space, pws, r1, pmc_load_exc);
	ldwma(s_space, pws, r2, pmc_load_exc);
	ldwma(s_space, pws, r3, pmc_load_exc);
	ldwma(s_space, pws, r4, pmc_load_exc);
	stwma(d_space, r1, pwd, pmc_store_exc);
	stwma(d_space, r2, pwd, pmc_store_exc);
	stwma(d_space, r3, pwd, pmc_store_exc);
	stwma(d_space, r4, pwd, pmc_store_exc);

	ldwma(s_space, pws, r5, pmc_load_exc);
	ldwma(s_space, pws, r6, pmc_load_exc);
	ldwma(s_space, pws, r7, pmc_load_exc);
	ldwma(s_space, pws, r8, pmc_load_exc);
	stwma(d_space, r5, pwd, pmc_store_exc);
	stwma(d_space, r6, pwd, pmc_store_exc);
	stwma(d_space, r7, pwd, pmc_store_exc);
	stwma(d_space, r8, pwd, pmc_store_exc);
	len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
	register unsigned int r1,r2,r3,r4;
	ldwma(s_space, pws, r1, pmc_load_exc);
	ldwma(s_space, pws, r2, pmc_load_exc);
	ldwma(s_space, pws, r3, pmc_load_exc);
	ldwma(s_space, pws, r4, pmc_load_exc);
	stwma(d_space, r1, pwd, pmc_store_exc);
	stwma(d_space, r2, pwd, pmc_store_exc);
	stwma(d_space, r3, pwd, pmc_store_exc);
	stwma(d_space, r4, pwd, pmc_store_exc);
	len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

	byte_copy:
	while (len) {
	/* pcd++ = pcs++; */
	ldbma(s_space, pcs, t3, pmc_load_exc);
	stbma(d_space, t3, pcd, pmc_store_exc);
	len--;
	}

	return 0;

	unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely(t1 & (sizeof(unsigned int)-1)) == 0) {
	t2 = src & (sizeof(unsigned int) - 1);

	if (unlikely(t2 != 0)) {
	t2 = sizeof(unsigned int) - t2;
	while (t2) {
	/* pcd++ = pcs++; */
	ldbma(s_space, pcs, t3, pmc_load_exc);
	stbma(d_space, t3, pcd, pmc_store_exc);
	len--;
	t2--;
	}
	}

	pws = (unsigned int *)pcs;
	pwd = (unsigned int *)pcd;
	goto word_copy;
	}

	/* Align the destination. */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
	t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
	while (t2) {
	/* pcd++ = pcs++; */
	ldbma(s_space, pcs, t3, pmc_load_exc);
	stbma(d_space, t3, pcd, pmc_store_exc);
	len--;
	t2--;
	}
	dst = (unsigned long)pcd;
	src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
	o_dst, o_src, o_len);
	if (ret)
	return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

	handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
	o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

	handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
	o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
	}

	#ifdef __KERNEL__
	unsigned long copy_to_user(void __user dst, const void src, unsigned long len)
	{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
	}

	unsigned long copy_from_user(void dst, const void __user src, unsigned long len)
	{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
	}

	unsigned long copy_in_user(void __user dst, const void __user src, unsigned long len)
	{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force )dst, (void __force )src, len);
	}


	/* Kernel-to-kernel memcpy: both space registers get the kernel space
	 * id, the pa_memcpy() status is discarded and dst is handed back. */
	void * memcpy(void * dst,const void *src, size_t count)
	{
		void *const result = dst;

		mtsp(get_kernel_space(), 1);
		mtsp(get_kernel_space(), 2);
		(void)pa_memcpy(dst, src, count);

		return result;
	}

	/* Export the four copy entry points defined above. */
	EXPORT_SYMBOL(copy_to_user);
	EXPORT_SYMBOL(copy_from_user);
	EXPORT_SYMBOL(copy_in_user);
	EXPORT_SYMBOL(memcpy);
	#endif