arch/parisc/lib/memcpy.c - linux/kernel/git/viro/vfs - Git at Google

 /*
  *    Optimized memory copy routines.
  *
  *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
  *    Copyright (C) 2013 Helge Deller <deller@gmx.de>
  *
  *    This program is free software; you can redistribute it and/or modify
  *    it under the terms of the GNU General Public License as published by
  *    the Free Software Foundation; either version 2, or (at your option)
  *    any later version.
  *
  *    This program is distributed in the hope that it will be useful,
  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *    GNU General Public License for more details.
  *
  *    You should have received a copy of the GNU General Public License
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  *    Portions derived from the GNU C Library
  *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
  *
  * Several strategies are used to get the best performance for various
  * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
  * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
  * general registers.  Unaligned copies are handled either by aligning the
  * destination and then using a shift-and-write method, or in a few cases by
  * falling back to a byte-at-a-time copy.
  *
  * I chose to implement this in C because it is easier to maintain and debug,
  * and in my experiments it appears that the C code generated by gcc (3.3/3.4
  * at the time of writing) is fairly optimal. Unfortunately some of the
  * semantics of the copy routine (exception handling) are difficult to express
  * in C, so we have to play some tricks to get them to work.
  *
  * All the loads and stores are done via explicit asm() code in order to use
  * the right space registers.
  *
  * Testing with various alignments and buffer sizes shows that this code is
  * often >10x faster than a simple byte-at-a-time copy, even for strangely
  * aligned operands. It is interesting to note that the glibc version
  * of memcpy (written in C) is actually quite fast already. This routine is
  * able to beat it by 30-40% for aligned copies because of the loop unrolling,
  * but in some cases the glibc version is still slightly faster. This lends
  * more credibility that gcc can generate very good code as long as we are
  * careful.
  *
  * TODO:
  * - cache prefetching needs more experimentation to get optimal settings
  * - try not to use the post-increment address modifiers; they create additional
  *   interlocks
  * - replace byte-copy loops with stbys sequences
  */

 #ifdef __KERNEL__
 #include <linux/module.h>
 #include <linux/compiler.h>
 #include <linux/uaccess.h>
 #define s_space "%%sr1"
 #define d_space "%%sr2"
 #else
 #include "memcpy.h"
 #define s_space "%%sr0"
 #define d_space "%%sr0"
 #define pa_memcpy new2_copy
 #endif

 /*
  * Per-CPU exception_data declaration.  Nothing in the code visible in this
  * file reads or writes it.
  * NOTE(review): possibly a leftover from an older fault-handling scheme --
  * confirm before removing.
  */
 DECLARE_PER_CPU(struct exception_data, exception_data);

 /*
  * preserve_branch(label) - emit a never-taken "goto label".
  *
  * The fault handlers in this file are entered only through the
  * exception-table entries emitted by the load/store asm statements, so the
  * compiler sees no C-level path to them.  Comparing a volatile local with
  * itself gives gcc a branch it will not remove, which keeps the code behind
  * 'label' from being discarded.
  */
 #define preserve_branch(label)	do {					\
 	volatile int dummy = 0;						\
 	/* The following branch is never taken, it's just here to  */	\
 	/* prevent gcc from optimizing away our exception code. */ 	\
 	if (unlikely(dummy != dummy))					\
 		goto label;						\
 } while (0)

 /*
  * Space id used for user-space accesses: 0 when uaccess_kernel() is true,
  * otherwise whatever mfsp(3) returns.
  */
 #define get_user_space() (uaccess_kernel() ? 0 : mfsp(3))
 /* Space id used for kernel accesses: always 0. */
 #define get_kernel_space() (0)

 /*
  * MERGE(w0, sh_1, w1, sh_2) - combine two consecutively loaded source words
  * into one destination word.
  *
  * Loads sh_2 into %sar (mtsar) and evaluates to the result of
  * "shrpw w0, w1, %sar".  sh_1 is accepted but not referenced by the asm.
  * NOTE(review): sh_1 is presumably kept so the call sites match the glibc
  * MERGE macro this code was derived from -- confirm.
  */
 #define MERGE(w0, sh_1, w1, sh_2)  ({					\
 	unsigned int _r;						\
 	asm volatile (							\
 	"mtsar %3\n"							\
 	"shrpw %1, %2, %%sar, %0\n"					\
 	: "=r"(_r)							\
 	: "r"(w0), "r"(w1), "r"(sh_2)					\
 	);								\
 	_r;								\
 })
 /* Copies shorter than this many bytes go straight to the byte loop. */
 #define THRESHOLD	16

 /*
  * Debug trace helper.  With DEBUG_MEMCPY defined it prints file, line and
  * function at KERN_DEBUG, followed by the formatted message in a second
  * printk(); otherwise it expands to nothing.  No call site is visible in
  * this file.
  */
 #ifdef DEBUG_MEMCPY
 #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
 #else
 #define DPRINTF(fmt, args...)
 #endif

 /*
  * def_load_ai_insn - emit one load using the ",ma" completer together with
  * an exception-table entry for it.
  *
  *  _insn  load mnemonic (ldbs and ldw are used below)
  *  _sz    displacement written into the instruction
  *  _tt    constraint string for the loaded value
  *  _s     space register string
  *  _a     address variable, updated in place ("+r")
  *  _t     variable receiving the loaded value
  *  _e     asm label registered as the fixup target for local label 1
  *
  * NOTE(review): r8 is listed as clobbered although the template never
  * mentions it -- presumably needed by the fault fixup path; confirm.
  */
 #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
 	__asm__ __volatile__ (				\
 	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"	\
 	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
 	: _tt(_t), "+r"(_a)				\
 	: 						\
 	: "r8")

 /*
  * def_store_ai_insn - emit one store using the ",ma" completer together with
  * an exception-table entry for it.
  *
  *  _insn  store mnemonic (stbs and stw are used below)
  *  _sz    displacement written into the instruction
  *  _tt    constraint string for the value being stored
  *  _s     space register string
  *  _a     address variable, updated in place ("+r")
  *  _t     value to store
  *  _e     asm label registered as the fixup target for local label 1
  *
  * NOTE(review): r8 is listed as clobbered although the template never
  * mentions it -- presumably needed by the fault fixup path; confirm.
  */
 #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) 	\
 	__asm__ __volatile__ (				\
 	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"	\
 	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
 	: "+r"(_a) 					\
 	: _tt(_t)					\
 	: "r8")

 /*
  * Byte and word accessors used by the copy routines.  They operate on the
  * local variables literally named 'src' and 'dst' in the calling function
  * and register label _e as the place to continue if the access faults.
  * Loads go through s_space, stores through d_space.
  */
 #define load1(_t, _e) def_load_ai_insn(ldbs,1,"=r",s_space,src,_t,_e)
 #define store1(_t, _e) def_store_ai_insn(stbs,1,"r",d_space,dst,_t,_e)
 #define load4(_t, _e) def_load_ai_insn(ldw,4,"=r",s_space,src,_t,_e)
 #define store4(_t, _e) def_store_ai_insn(stw,4,"r",d_space,dst,_t,_e)

 /*
  * Optional prefetch helpers.  With CONFIG_PREFETCH they issue a load whose
  * target is %r0 from the given address in the source (s_space) or
  * destination (d_space) space -- presumably acting as a cache prefetch hint;
  * otherwise they are no-ops.  Every call site visible in this file is
  * commented out.
  */
 #ifdef  CONFIG_PREFETCH
 static inline void prefetch_src(const void *addr)
 {
 	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
 }

 static inline void prefetch_dst(const void *addr)
 {
 	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
 }
 #else
 #define prefetch_src(addr) do { } while(0)
 #define prefetch_dst(addr) do { } while(0)
 #endif

 /*
  * copy_dstaligned - copy whole words from a not-aligned src to an aligned
  * dst, using aligned loads and shifts.  Handles 4 words per loop iteration.
  * This code is derived from glibc.
  *
  * @dst: destination address; pa_memcpy() byte-copies up to a word boundary
  *       before calling
  * @src: source address; rounded down to a word boundary here, the original
  *       low bits determine the shift amounts
  * @len: number of words (not bytes) to copy; pa_memcpy() passes
  *       len / sizeof(unsigned int)
  *
  * Return 0 on success,
  *	  the current dst (just past the last word stored) on a load fault,
  *	  the current dst + 1 on a store fault.
  * The caller tells the two fault cases apart by testing bit 0 of the result.
  */
 static noinline unsigned long copy_dstaligned(unsigned long dst,
 					unsigned long src, unsigned long len)
 {
 	/* gcc complains that a2 and a3 may be uninitialized, but actually
 	 * they cannot be.  Initialize a2/a3 to shut gcc up.
 	 */
 	register unsigned int a0, a1, a2 = 0, a3 = 0;
 	int sh_1, sh_2;

 	/* prefetch_src((const void *)src); */

 	/* Calculate how to shift a word read at the memory operation
 	   aligned srcp to make it aligned for copy.  */
 	sh_1 = 8 * (src % sizeof(unsigned int));
 	sh_2 = 8 * sizeof(unsigned int) - sh_1;

 	/* Make src aligned by rounding it down.  */
 	src &= -sizeof(unsigned int);

 	/*
 	 * Preload two words and jump into the 4-way unrolled loop at the
 	 * entry point matching len % 4.  len is adjusted so that the
 	 * "len -= 4" at the bottom of the loop reaches exactly 0.
 	 */
 	switch (len % 4)
 	{
 		case 2:
 			load4(a1, cda_ldw_exc);
 			load4(a2, cda_ldw_exc);
 			len += 2;
 			goto do1;
 		case 3:
 			load4(a0, cda_ldw_exc);
 			load4(a1, cda_ldw_exc);
 			len += 1;
 			goto do2;
 		case 0:
 			/* Nothing to copy at all. */
 			if (len == 0)
 				return 0;
 			load4(a3, cda_ldw_exc);
 			load4(a0, cda_ldw_exc);
 			len += 0;
 			goto do3;
 		case 1:
 			load4(a2, cda_ldw_exc);
 			load4(a3, cda_ldw_exc);
 			len -= 1;
 			/* A single word: only the final store at do0 is needed. */
 			if (len == 0)
 				goto do0;
 			goto do4;			/* No-op.  */
 	}

 	/*
 	 * Each step loads the next source word and stores the merge of the
 	 * two words loaded before it.
 	 */
 	do
 	{
 		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
 do4:
 		load4(a0, cda_ldw_exc);
 		store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc);
 do3:
 		load4(a1, cda_ldw_exc);
 		store4(MERGE (a3, sh_1, a0, sh_2), cda_stw_exc);
 do2:
 		load4(a2, cda_ldw_exc);
 		store4(MERGE (a0, sh_1, a1, sh_2), cda_stw_exc);
 do1:
 		load4(a3, cda_ldw_exc);
 		store4(MERGE (a1, sh_1, a2, sh_2), cda_stw_exc);

 		len -= 4;
 	}
 	while (len != 0);

 do0:
 	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
 	/* Store the last word, built from the two words still pending. */
 	store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc);

 	/* Keep the fault handlers below from being optimized away. */
 	preserve_branch(handle_load_error);
 	preserve_branch(handle_store_error);

 	return 0;

 	/*
 	 * The asm labels below are the fixup targets named in the
 	 * load4()/store4() calls above.
 	 */
 handle_load_error:
 	__asm__ __volatile__ ("cda_ldw_exc:\n");
 	return dst;

 handle_store_error:
 	__asm__ __volatile__ ("cda_stw_exc:\n");
 	return dst + 1;
 }


 /*
  * pa_memcpy - copy len bytes from srcp to dstp.
  *
  * All accesses go through s_space (loads) and d_space (stores); the callers
  * in this file load the space registers with mtsp() before calling.
  *
  * Returns 0 for success, otherwise, returns number of bytes not transferred.
  */
 static noinline unsigned long pa_memcpy(void *dstp, const void *srcp,
 					unsigned long len)
 {
 	register unsigned long src, dst, t1, t2, t3;
 	unsigned long ret, end;

 	src = (unsigned long)srcp;
 	dst = (unsigned long)dstp;
 	end = dst + len;
 	/*
 	 * Empty asm taking 'end' as an input.
 	 * NOTE(review): presumably forces 'end' to be computed up front so
 	 * the pmc_done handler can use it -- confirm.
 	 */
 	asm volatile("": : "r"(end));

 	/* prefetch_src((const void *)srcp); */

 	/* Short copies are not worth the setup; do them byte by byte. */
 	if (len < THRESHOLD)
 		goto byte_copy;

 	/* Check alignment */
 	t1 = (src ^ dst);
 	if (unlikely(t1 & (sizeof(unsigned int)-1)))
 		goto unaligned_copy;

 	/* src and dst have same alignment. */

 	/* Copy bytes until src (and therefore dst) is word-aligned. */
 	t2 = src & (sizeof(unsigned int) - 1);
 	if (unlikely(t2 != 0)) {
 		t2 = sizeof(unsigned int) - t2;
 		while (t2 && len) {
 			load1(t3, pmc_done);
 			len--;
 			store1(t3, pmc_done);
 			t2--;
 		}
 	}

 	/*
 	 * 8 words per iteration, as two groups of four loads followed by
 	 * four stores.  A fault on the 2nd/3rd/4th load of a group goes to
 	 * pmc_byte4/8/12; every other fault goes to pmc_done.
 	 */
 	while (len >= 8*sizeof(unsigned int)) {
 		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
 		/* prefetch_src((char *)src + L1_CACHE_BYTES); */
 		load4(r1, pmc_done);
 		load4(r2, pmc_byte4);		// bytecopy src-4
 		load4(r3, pmc_byte8);		// bytecopy src-8
 		load4(r4, pmc_byte12);		// bytecopy src-12
 		store4(r1, pmc_done);
 		store4(r2, pmc_done);
 		store4(r3, pmc_done);
 		store4(r4, pmc_done);

 		load4(r5, pmc_done);
 		load4(r6, pmc_byte4);		// bytecopy src-4
 		load4(r7, pmc_byte8);		// bytecopy src-8
 		load4(r8, pmc_byte12);		// bytecopy src-12
 		store4(r5, pmc_done);
 		store4(r6, pmc_done);
 		store4(r7, pmc_done);
 		store4(r8, pmc_done);
 		len -= 8*sizeof(unsigned int);
 	}

 	/* 4 words per iteration, same fault handling as above. */
 	while (len >= 4*sizeof(unsigned int)) {
 		register unsigned int r1,r2,r3,r4;
 		load4(r1, pmc_done);
 		load4(r2, pmc_byte4);		// bytecopy src-4
 		load4(r3, pmc_byte8);		// bytecopy src-8
 		load4(r4, pmc_byte12);		// bytecopy src-12
 		store4(r1, pmc_done);
 		store4(r2, pmc_done);
 		store4(r3, pmc_done);
 		store4(r4, pmc_done);
 		len -= 4*sizeof(unsigned int);
 	}

 	/* Tail of every path, and the retry path after a word-load fault. */
 byte_copy:
 	while (len) {
 		load1(t3, pmc_done);
 		store1(t3, pmc_done);
 		len--;
 	}

 	return 0;

 unaligned_copy:
 	/*
 	 * src and dst differ in their low two bits.  Byte-copy until dst is
 	 * word-aligned; len >= THRESHOLD here, so len cannot run out.
 	 */
 	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
 		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
 		while (t2) {
 			load1(t3, pmc_done);
 			store1(t3, pmc_done);
 			len--;
 			t2--;
 		}
 	}

 	/* Copy all whole words with the shift-and-merge helper. */
 	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
 	if (unlikely(ret)) {
 		/* Bit 0 set: store fault at ret - 1; report what is left. */
 		if (ret & 1)
 			return len - (ret - 1 - dst);
 		/*
 		 * Load fault: account for the words already stored, then
 		 * continue byte-wise from there.
 		 */
 		src += ret - dst;
 		len -= ret - dst;
 		dst = ret;
 		goto byte_copy;
 	}

 	/* Skip over the words just copied; at most 3 bytes remain. */
 	src += (len & -sizeof(unsigned int));
 	dst += (len & -sizeof(unsigned int));
 	len %= sizeof(unsigned int);

 	/* Keep the fault handlers below from being optimized away. */
 	preserve_branch(handle_error1);
 	preserve_branch(handle_error2);
 	preserve_branch(handle_error3);
 	preserve_branch(handle_error4);
 	goto byte_copy;

 	/*
 	 * The asm labels below are the fixup targets named in the
 	 * load/store calls above.
 	 */
 handle_error1:
 	__asm__ __volatile__ ("pmc_done:\n");
 	/* Everything from the current dst up to end was not transferred. */
 	return end - dst;
 	/*
 	 * pmc_byte4/8/12: src has already moved past the words loaded earlier
 	 * in the group while no store of that group has run yet, so rewind
 	 * src by that amount and retry byte-wise.
 	 * NOTE(review): in the 8-word loop len has not been reduced for the
 	 * current iteration when these run; the byte loop is expected to
 	 * fault again and return through pmc_done, which uses end - dst
 	 * rather than len.
 	 */
 handle_error2:
 	__asm__ __volatile__ ("pmc_byte4:\n");
 	src -= 4;
 	goto byte_copy;
 handle_error3:
 	__asm__ __volatile__ ("pmc_byte8:\n");
 	src -= 8;
 	goto byte_copy;
 handle_error4:
 	__asm__ __volatile__ ("pmc_byte12:\n");
 	src -= 12;
 	goto byte_copy;
 }

 #ifdef __KERNEL__
 /*
  * raw_copy_to_user - copy len bytes from kernel memory at src to user
  * memory at dst.
  *
  * Returns whatever pa_memcpy() returns: 0 on success, otherwise the number
  * of bytes not transferred.
  */
 unsigned long raw_copy_to_user(void __user *dst, const void *src,
 			       unsigned long len)
 {
 	void *to = (void __force *)dst;

 	/* Loads use the kernel space, stores use the user space. */
 	mtsp(get_kernel_space(), 1);
 	mtsp(get_user_space(), 2);

 	return pa_memcpy(to, src, len);
 }
 EXPORT_SYMBOL(raw_copy_to_user);

 /*
  * raw_copy_from_user - copy len bytes from user memory at src to kernel
  * memory at dst.
  *
  * Returns whatever pa_memcpy() returns: 0 on success, otherwise the number
  * of bytes not transferred.
  */
 unsigned long raw_copy_from_user(void *dst, const void __user *src,
 			       unsigned long len)
 {
 	const void *from = (void __force *)src;

 	/* Loads use the user space, stores use the kernel space. */
 	mtsp(get_user_space(), 1);
 	mtsp(get_kernel_space(), 2);

 	return pa_memcpy(dst, from, len);
 }
 EXPORT_SYMBOL(raw_copy_from_user);

 /*
  * raw_copy_in_user - copy len bytes from user memory at src to user memory
  * at dst.
  *
  * Returns whatever pa_memcpy() returns: 0 on success, otherwise the number
  * of bytes not transferred.
  */
 unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len)
 {
 	void *to = (void __force *)dst;
 	const void *from = (void __force *)src;

 	/* Both loads and stores use the user space. */
 	mtsp(get_user_space(), 1);
 	mtsp(get_user_space(), 2);

 	return pa_memcpy(to, from, len);
 }


 /*
  * memcpy - kernel-to-kernel copy of count bytes from src to dst.
  *
  * Always returns dst; the result of pa_memcpy() is discarded.
  */
 void *memcpy(void *dst, const void *src, size_t count)
 {
 	/* Both loads and stores use the kernel space. */
 	mtsp(get_kernel_space(), 1);
 	mtsp(get_kernel_space(), 2);

 	(void)pa_memcpy(dst, src, count);

 	return dst;
 }

 EXPORT_SYMBOL(raw_copy_in_user);
 EXPORT_SYMBOL(memcpy);

 /*
  * probe_kernel_read - read size bytes from src into dst, refusing any
  * source address below PAGE_SIZE.
  *
  * Returns -EFAULT for a source address below PAGE_SIZE, otherwise the result
  * of __probe_kernel_read().
  */
 long probe_kernel_read(void *dst, const void *src, size_t size)
 {
 	const unsigned long start = (unsigned long)src;

 	/* check for I/O space F_EXTEND(0xfff00000) access as well? */

 	if (start >= PAGE_SIZE)
 		return __probe_kernel_read(dst, src, size);

 	return -EFAULT;
 }

 #endif
	/*
	* Optimized memory copy routines.
	*
	* Copyright (C) 2004 Randolph Chung <tausq@debian.org>
	* Copyright (C) 2013 Helge Deller <deller@gmx.de>
	*
	* This program is free software; you can redistribute it and/or modify
	* it under the terms of the GNU General Public License as published by
	* the Free Software Foundation; either version 2, or (at your option)
	* any later version.
	*
	* This program is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	* GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with this program; if not, write to the Free Software
	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	*
	* Portions derived from the GNU C Library
	* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
	*
	* Several strategies are used to get the best performance for various
	* conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
	* fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
	* general registers. Unaligned copies are handled either by aligning the
	* destination and then using a shift-and-write method, or in a few cases by
	* falling back to a byte-at-a-time copy.
	*
	* I chose to implement this in C because it is easier to maintain and debug,
	* and in my experiments it appears that the C code generated by gcc (3.3/3.4
	* at the time of writing) is fairly optimal. Unfortunately some of the
	* semantics of the copy routine (exception handling) are difficult to express
	* in C, so we have to play some tricks to get them to work.
	*
	* All the loads and stores are done via explicit asm() code in order to use
	* the right space registers.
	*
	* Testing with various alignments and buffer sizes shows that this code is
	* often >10x faster than a simple byte-at-a-time copy, even for strangely
	* aligned operands. It is interesting to note that the glibc version
	* of memcpy (written in C) is actually quite fast already. This routine is
	* able to beat it by 30-40% for aligned copies because of the loop unrolling,
	* but in some cases the glibc version is still slightly faster. This lends
	* more credibility that gcc can generate very good code as long as we are
	* careful.
	*
	* TODO:
	* - cache prefetching needs more experimentation to get optimal settings
	* - try not to use the post-increment address modifiers; they create additional
	* interlocks
	* - replace byte-copy loops with stbys sequences
	*/

	#ifdef __KERNEL__
	#include <linux/module.h>
	#include <linux/compiler.h>
	#include <linux/uaccess.h>
	#define s_space "%%sr1"
	#define d_space "%%sr2"
	#else
	#include "memcpy.h"
	#define s_space "%%sr0"
	#define d_space "%%sr0"
	#define pa_memcpy new2_copy
	#endif

	/*
	 * Per-CPU exception_data declaration.  Nothing in the code visible in this
	 * file reads or writes it.
	 * NOTE(review): possibly a leftover from an older fault-handling scheme --
	 * confirm before removing.
	 */
	DECLARE_PER_CPU(struct exception_data, exception_data);

	/*
	 * preserve_branch(label) - emit a never-taken "goto label".
	 *
	 * The fault handlers in this file are entered only through the
	 * exception-table entries emitted by the load/store asm statements, so the
	 * compiler sees no C-level path to them.  Comparing a volatile local with
	 * itself gives gcc a branch it will not remove, which keeps the code behind
	 * 'label' from being discarded.
	 */
	#define preserve_branch(label) do { \
	volatile int dummy = 0; \
	/* The following branch is never taken, it's just here to */ \
	/* prevent gcc from optimizing away our exception code. */ \
	if (unlikely(dummy != dummy)) \
	goto label; \
	} while (0)

	/*
	 * Space id used for user-space accesses: 0 when uaccess_kernel() is true,
	 * otherwise whatever mfsp(3) returns.
	 */
	#define get_user_space() (uaccess_kernel() ? 0 : mfsp(3))
	/* Space id used for kernel accesses: always 0. */
	#define get_kernel_space() (0)

	/*
	 * MERGE(w0, sh_1, w1, sh_2) - combine two consecutively loaded source words
	 * into one destination word.
	 *
	 * Loads sh_2 into %sar (mtsar) and evaluates to the result of
	 * "shrpw w0, w1, %sar".  sh_1 is accepted but not referenced by the asm.
	 * NOTE(review): sh_1 is presumably kept so the call sites match the glibc
	 * MERGE macro this code was derived from -- confirm.
	 */
	#define MERGE(w0, sh_1, w1, sh_2) ({ \
	unsigned int _r; \
	asm volatile ( \
	"mtsar %3\n" \
	"shrpw %1, %2, %%sar, %0\n" \
	: "=r"(_r) \
	: "r"(w0), "r"(w1), "r"(sh_2) \
	); \
	_r; \
	})
	/* Copies shorter than this many bytes go straight to the byte loop. */
	#define THRESHOLD 16

	/*
	 * Debug trace helper.  With DEBUG_MEMCPY defined it prints file, line and
	 * function at KERN_DEBUG, followed by the formatted message in a second
	 * printk(); otherwise it expands to nothing.  No call site is visible in
	 * this file.
	 */
	#ifdef DEBUG_MEMCPY
	#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
	#else
	#define DPRINTF(fmt, args...)
	#endif

	/*
	 * def_load_ai_insn - emit one load using the ",ma" completer together with
	 * an exception-table entry for it.
	 *
	 *  _insn  load mnemonic (ldbs and ldw are used below)
	 *  _sz    displacement written into the instruction
	 *  _tt    constraint string for the loaded value
	 *  _s     space register string
	 *  _a     address variable, updated in place ("+r")
	 *  _t     variable receiving the loaded value
	 *  _e     asm label registered as the fixup target for local label 1
	 *
	 * NOTE(review): r8 is listed as clobbered although the template never
	 * mentions it -- presumably needed by the fault fixup path; confirm.
	 */
	#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
	: _tt(_t), "+r"(_a) \
	: \
	: "r8")

	/*
	 * def_store_ai_insn - emit one store using the ",ma" completer together with
	 * an exception-table entry for it.
	 *
	 *  _insn  store mnemonic (stbs and stw are used below)
	 *  _sz    displacement written into the instruction
	 *  _tt    constraint string for the value being stored
	 *  _s     space register string
	 *  _a     address variable, updated in place ("+r")
	 *  _t     value to store
	 *  _e     asm label registered as the fixup target for local label 1
	 *
	 * NOTE(review): r8 is listed as clobbered although the template never
	 * mentions it -- presumably needed by the fault fixup path; confirm.
	 */
	#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
	__asm__ __volatile__ ( \
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
	: "+r"(_a) \
	: _tt(_t) \
	: "r8")

	/*
	 * Byte and word accessors used by the copy routines.  They operate on the
	 * local variables literally named 'src' and 'dst' in the calling function
	 * and register label _e as the place to continue if the access faults.
	 * Loads go through s_space, stores through d_space.
	 */
	#define load1(_t, _e) def_load_ai_insn(ldbs,1,"=r",s_space,src,_t,_e)
	#define store1(_t, _e) def_store_ai_insn(stbs,1,"r",d_space,dst,_t,_e)
	#define load4(_t, _e) def_load_ai_insn(ldw,4,"=r",s_space,src,_t,_e)
	#define store4(_t, _e) def_store_ai_insn(stw,4,"r",d_space,dst,_t,_e)

	/*
	 * Optional prefetch helpers.  With CONFIG_PREFETCH they issue a load whose
	 * target is %r0 from the given address in the source (s_space) or
	 * destination (d_space) space -- presumably acting as a cache prefetch hint;
	 * otherwise they are no-ops.  No active call site is visible in this file.
	 */
	#ifdef CONFIG_PREFETCH
	static inline void prefetch_src(const void *addr)
	{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
	}

	static inline void prefetch_dst(const void *addr)
	{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
	}
	#else
	#define prefetch_src(addr) do { } while(0)
	#define prefetch_dst(addr) do { } while(0)
	#endif

	/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
	* per loop. This code is derived from glibc.
	*
	* Return 0 on success,
	* last address stored on load fault
	* last address stored + 1 on store fault
	*
	*/
	static noinline unsigned long copy_dstaligned(unsigned long dst,
	unsigned long src, unsigned long len)
	{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	* they cannot be. Initialize a2/a3 to shut gcc up.
	*/
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;

	/* prefetch_src((const void )src); /

	/* Calculate how to shift a word read at the memory operation
	aligned srcp to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
	case 2:
	load4(a1, cda_ldw_exc);
	load4(a2, cda_ldw_exc);
	len += 2;
	goto do1;
	case 3:
	load4(a0, cda_ldw_exc);
	load4(a1, cda_ldw_exc);
	len += 1;
	goto do2;
	case 0:
	if (len == 0)
	return 0;
	load4(a3, cda_ldw_exc);
	load4(a0, cda_ldw_exc);
	len += 0;
	goto do3;
	case 1:
	load4(a2, cda_ldw_exc);
	load4(a3, cda_ldw_exc);
	len -= 1;
	if (len == 0)
	goto do0;
	goto do4; /* No-op. */
	}

	do
	{
	/* prefetch_src((const void )(src + 4 sizeof(unsigned int))); */
	do4:
	load4(a0, cda_ldw_exc);
	store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc);
	do3:
	load4(a1, cda_ldw_exc);
	store4(MERGE (a3, sh_1, a0, sh_2), cda_stw_exc);
	do2:
	load4(a2, cda_ldw_exc);
	store4(MERGE (a0, sh_1, a1, sh_2), cda_stw_exc);
	do1:
	load4(a3, cda_ldw_exc);
	store4(MERGE (a1, sh_1, a2, sh_2), cda_stw_exc);

	len -= 4;
	}
	while (len != 0);

	do0:
	/* ((unsigned int ) dst)[0] = MERGE (a2, sh_1, a3, sh_2); /
	store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

	handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	return dst;

	handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	return dst + 1;
	}


	/* Returns 0 for success, otherwise, returns number of bytes not transferred. */
	static noinline unsigned long pa_memcpy(void dstp, const void srcp,
	unsigned long len)
	{
	register unsigned long src, dst, t1, t2, t3;
	unsigned long ret, end;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	end = dst + len;
	asm volatile("": : "r"(end));

	/* prefetch_src((const void )srcp); /

	if (len < THRESHOLD)
	goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(unsigned int)-1)))
	goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(unsigned int) - 1);
	if (unlikely(t2 != 0)) {
	t2 = sizeof(unsigned int) - t2;
	while (t2 && len) {
	load1(t3, pmc_done);
	len--;
	store1(t3, pmc_done);
	t2--;
	}
	}

	while (len >= 8*sizeof(unsigned int)) {
	register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
	/* prefetch_src((char )src + L1_CACHE_BYTES); /
	load4(r1, pmc_done);
	load4(r2, pmc_byte4); // bytecopy src-4
	load4(r3, pmc_byte8); // bytecopy src-8
	load4(r4, pmc_byte12); // bytecopy src-12
	store4(r1, pmc_done);
	store4(r2, pmc_done);
	store4(r3, pmc_done);
	store4(r4, pmc_done);

	load4(r5, pmc_done);
	load4(r6, pmc_byte4); // bytecopy src-4
	load4(r7, pmc_byte8); // bytecopy src-8
	load4(r8, pmc_byte12); // bytecopy src-12
	store4(r5, pmc_done);
	store4(r6, pmc_done);
	store4(r7, pmc_done);
	store4(r8, pmc_done);
	len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
	register unsigned int r1,r2,r3,r4;
	load4(r1, pmc_done);
	load4(r2, pmc_byte4); // bytecopy src-4
	load4(r3, pmc_byte8); // bytecopy src-8
	load4(r4, pmc_byte12); // bytecopy src-12
	store4(r1, pmc_done);
	store4(r2, pmc_done);
	store4(r3, pmc_done);
	store4(r4, pmc_done);
	len -= 4*sizeof(unsigned int);
	}

	byte_copy:
	while (len) {
	load1(t3, pmc_done);
	store1(t3, pmc_done);
	len--;
	}

	return 0;

	unaligned_copy:
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
	t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
	while (t2) {
	load1(t3, pmc_done);
	store1(t3, pmc_done);
	len--;
	t2--;
	}
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
	if (unlikely(ret)) {
	if (ret & 1)
	return len - (ret - 1 - dst);
	src += ret - dst;
	len -= ret - dst;
	dst = ret;
	goto byte_copy;
	}

	src += (len & -sizeof(unsigned int));
	dst += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_error1);
	preserve_branch(handle_error2);
	preserve_branch(handle_error3);
	preserve_branch(handle_error4);
	goto byte_copy;

	handle_error1:
	__asm__ __volatile__ ("pmc_done:\n");
	return end - dst;
	handle_error2:
	__asm__ __volatile__ ("pmc_byte4:\n");
	src -= 4;
	goto byte_copy;
	handle_error3:
	__asm__ __volatile__ ("pmc_byte8:\n");
	src -= 8;
	goto byte_copy;
	handle_error4:
	__asm__ __volatile__ ("pmc_byte12:\n");
	src -= 12;
	goto byte_copy;
	}

	#ifdef __KERNEL__
	unsigned long raw_copy_to_user(void __user dst, const void src,
	unsigned long len)
	{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
	}
	EXPORT_SYMBOL(raw_copy_to_user);

	unsigned long raw_copy_from_user(void dst, const void __user src,
	unsigned long len)
	{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
	}
	EXPORT_SYMBOL(raw_copy_from_user);

	unsigned long raw_copy_in_user(void __user dst, const void __user src, unsigned long len)
	{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force )dst, (void __force )src, len);
	}


	/*
	 * memcpy - kernel-to-kernel copy of count bytes from src to dst.
	 *
	 * Always returns dst; the result of pa_memcpy() is discarded.
	 */
	void *memcpy(void *dst, const void *src, size_t count)
	{
		/* Both loads and stores use the kernel space. */
		mtsp(get_kernel_space(), 1);
		mtsp(get_kernel_space(), 2);

		(void)pa_memcpy(dst, src, count);

		return dst;
	}

	EXPORT_SYMBOL(raw_copy_in_user);
	EXPORT_SYMBOL(memcpy);

	/*
	 * probe_kernel_read - read size bytes from src into dst, refusing any
	 * source address below PAGE_SIZE.
	 *
	 * Returns -EFAULT for a source address below PAGE_SIZE, otherwise the result
	 * of __probe_kernel_read().
	 */
	long probe_kernel_read(void *dst, const void *src, size_t size)
	{
		unsigned long addr = (unsigned long)src;

		if (addr < PAGE_SIZE)
			return -EFAULT;

		/* check for I/O space F_EXTEND(0xfff00000) access as well? */

		return __probe_kernel_read(dst, src, size);
	}

	#endif