blob: 7ce92823d77d051d1de66bb4ca990b97a18e0518 [file] [log] [blame]
/*
* Optimized memory copy routines.
*
* Copyright (C) 2004 Randolph Chung <tausq@debian.org>
* Copyright (C) 2013 Helge Deller <deller@gmx.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Portions derived from the GNU C Library
* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
*
* Several strategies are used to get the best performance for various
* conditions. In the optimal case, we copy 64-bytes in an unrolled loop using
* fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
* general registers. Unaligned copies are handled either by aligning the
* destination and then using shift-and-write method, or in a few cases by
* falling back to a byte-at-a-time copy.
*
* I chose to implement this in C because it is easier to maintain and debug,
* and in my experiments it appears that the C code generated by gcc (3.3/3.4
* at the time of writing) is fairly optimal. Unfortunately some of the
* semantics of the copy routine (exception handling) are difficult to express
* in C, so we have to play some tricks to get it to work.
*
* All the loads and stores are done via explicit asm() code in order to use
* the right space registers.
*
* Testing with various alignments and buffer sizes shows that this code is
* often >10x faster than a simple byte-at-a-time copy, even for strangely
* aligned operands. It is interesting to note that the glibc version
* of memcpy (written in C) is actually quite fast already. This routine is
* able to beat it by 30-40% for aligned copies because of the loop unrolling,
* but in some cases the glibc version is still slightly faster. This lends
* more credibility that gcc can generate very good code as long as we are
* careful.
*
* TODO:
* - cache prefetching needs more experimentation to get optimal settings
* - try not to use the post-increment address modifiers; they create additional
* interlocks
* - replace byte-copy loops with stybs sequences
*/
/*
 * Select the space registers used by the load/store asm further down.
 * The '%' is doubled because these strings are pasted into extended-asm
 * templates, where "%%" stands for a literal '%'.
 */
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
/*
 * Kernel build: loads go through %sr1 and stores through %sr2. The exported
 * entry points at the end of this file call mtsp(..., 1) and mtsp(..., 2)
 * before every pa_memcpy() call.
 */
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
/* Non-kernel build: loads and stores both go through %sr0 ... */
#define s_space "%%sr0"
#define d_space "%%sr0"
/* ... and the copy routine is compiled under the name new2_copy. */
#define pa_memcpy new2_copy
#endif
/*
 * Per-CPU exception_data declaration. Nothing visible in this file refers to
 * it. NOTE(review): confirm whether it is still needed here.
 */
DECLARE_PER_CPU(struct exception_data, exception_data);
/*
 * preserve_branch(label) - emit a "goto label" that is never taken.
 *
 * The fault handlers in this file are entered only through asm labels that
 * are named in exception-table entries, so no C statement jumps to them.
 * Comparing a volatile variable with itself yields a condition that is false
 * at run time but that the compiler still has to evaluate, which keeps the C
 * label reachable and its code in the generated output.
 */
#define preserve_branch(label) do { \
volatile int dummy = 0; \
/* The following branch is never taken, it's just here to */ \
/* prevent gcc from optimizing away our exception code. */ \
if (unlikely(dummy != dummy)) \
goto label; \
} while (0)
/*
 * Space ID used for user addresses: 0 when uaccess_kernel() is true,
 * otherwise the result of mfsp(3) (presumably the contents of %sr3 -
 * confirm against the mfsp() definition).
 */
#define get_user_space() (uaccess_kernel() ? 0 : mfsp(3))
/* Space ID used for kernel addresses: always 0. */
#define get_kernel_space() (0)
/*
 * MERGE(w0, sh_1, w1, sh_2) - combine two adjacent source words into one
 * destination word.
 *
 * The expansion writes sh_2 to the shift-amount register with mtsar and then
 * issues "shrpw w0, w1, %sar", returning the 32-bit result.
 * sh_1 is accepted for the benefit of the call sites but is not used by the
 * expansion.
 *
 * NOTE(review): shrpw is expected to shift the pair w0:w1 right by %sar bits
 * and keep the low word - confirm against the PA-RISC architecture manual.
 */
#define MERGE(w0, sh_1, w1, sh_2) ({ \
unsigned int _r; \
asm volatile ( \
"mtsar %3\n" \
"shrpw %1, %2, %%sar, %0\n" \
: "=r"(_r) \
: "r"(w0), "r"(w1), "r"(sh_2) \
); \
_r; \
})
/* Copies shorter than this many bytes go straight to the byte-at-a-time loop. */
#define THRESHOLD 16

/*
 * DPRINTF(fmt, ...) - debug trace, compiled in only when DEBUG_MEMCPY is
 * defined.
 *
 * The "file:line:function" prefix and the caller's message are emitted with a
 * single printk() so that they end up in the same log record; two separate
 * printk(KERN_DEBUG ...) calls would log them as two records. fmt has to be a
 * string literal because it is concatenated with the prefix (it already had
 * to be one, since it was pasted after KERN_DEBUG).
 *
 * Without DEBUG_MEMCPY the macro expands to an empty statement that is safe
 * to use in unbraced if/else bodies.
 */
#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) \
	do { \
		printk(KERN_DEBUG "%s:%d:%s " fmt, \
		       __FILE__, __LINE__, __func__, ##args); \
	} while (0)
#else
#define DPRINTF(fmt, args...) do { } while (0)
#endif
/*
 * def_load_ai_insn() / def_store_ai_insn() - emit one load or store that
 * uses the ",ma" (address-modify) completer and has an exception-table entry.
 *
 *  _insn  instruction mnemonic; ",ma" is appended to it
 *  _sz    displacement written in front of the address operand
 *  _tt    asm constraint string for the data operand
 *  _s     space register string (s_space or d_space)
 *  _a     address variable; bound as "+r" so the modified address is written
 *         back to it
 *  _t     data operand (output of a load, input of a store)
 *  _e     asm label registered in the exception table for the instruction at
 *         local label 1
 *
 * r8 is declared as clobbered. NOTE(review): presumably the fault fixup path
 * uses it - confirm against the parisc exception handling code.
 */
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
__asm__ __volatile__ ( \
"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
: _tt(_t), "+r"(_a) \
: \
: "r8")
#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
__asm__ __volatile__ ( \
"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \
: "+r"(_a) \
: _tt(_t) \
: "r8")
/*
 * Byte and word accessors. They operate on variables literally named "src"
 * (loads) and "dst" (stores) in the enclosing function and update those
 * variables through the "+r" address operand; _e is the fault label.
 */
#define load1(_t, _e) def_load_ai_insn(ldbs,1,"=r",s_space,src,_t,_e)
#define store1(_t, _e) def_store_ai_insn(stbs,1,"r",d_space,dst,_t,_e)
#define load4(_t, _e) def_load_ai_insn(ldw,4,"=r",s_space,src,_t,_e)
#define store4(_t, _e) def_store_ai_insn(stw,4,"r",d_space,dst,_t,_e)
/*
 * Optional prefetch helpers. With CONFIG_PREFETCH they issue a load from the
 * given address into %r0 (ldw through the source space, ldd through the
 * destination space); otherwise they expand to empty statements.
 * Every call site in this file is currently commented out.
 */
#ifdef CONFIG_PREFETCH
/* Touch addr in the source space with a word load whose target is %r0. */
static inline void prefetch_src(const void *addr)
{
__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}
/* Touch addr in the destination space with a doubleword load whose target is %r0. */
static inline void prefetch_dst(const void *addr)
{
__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop. This code is derived from glibc.
 *
 * dst: destination address. The only caller byte-copies until dst is
 *      word-aligned before calling, so bit 0 of dst is clear.
 * src: source address; rounded down to a word boundary below, with the
 *      discarded offset turned into the shift amounts.
 * len: number of 32-bit words to store (the caller passes bytes / 4).
 *
 * Every load4()/store4() advances src/dst by 4, so at any point dst is the
 * address at which the next word would be stored.
 *
 * Return 0 on success,
 * the current dst on load fault
 * the current dst + 1 on store fault
 * (the caller tests bit 0 to tell the two apart and treats the distance from
 * its own dst as the number of bytes already stored).
 */
static noinline unsigned long copy_dstaligned(unsigned long dst,
unsigned long src, unsigned long len)
{
/* gcc complains that a2 and a3 may be uninitialized, but actually
* they cannot be. Initialize a2/a3 to shut gcc up.
*/
register unsigned int a0, a1, a2 = 0, a3 = 0;
int sh_1, sh_2;
/* prefetch_src((const void *)src); */
/* Calculate how to shift a word read at the memory operation
aligned srcp to make it aligned for copy. */
/* sh_1 is the bit offset of src inside its word, sh_2 the complement to 32.
 * Only sh_2 is consumed by the MERGE() expansion. */
sh_1 = 8 * (src % sizeof(unsigned int));
sh_2 = 8 * sizeof(unsigned int) - sh_1;
/* Make src aligned by rounding it down. */
src &= -sizeof(unsigned int);
/*
 * Enter the 4-way unrolled loop at the position matching len % 4.
 * Each case preloads the two source words its entry point expects and
 * adjusts len so that the loop's "len -= 4" lands exactly on 0; the final
 * word is always stored at do0.
 */
switch (len % 4)
{
case 2:
/* One store at do1 in the first pass, then the one at do0. */
load4(a1, cda_ldw_exc);
load4(a2, cda_ldw_exc);
len += 2;
goto do1;
case 3:
/* Two stores (do2, do1) in the first pass, then the one at do0. */
load4(a0, cda_ldw_exc);
load4(a1, cda_ldw_exc);
len += 1;
goto do2;
case 0:
/* Nothing to do for an empty request. */
if (len == 0)
return 0;
/* Three stores (do3, do2, do1) in the first pass, then the one at do0. */
load4(a3, cda_ldw_exc);
load4(a0, cda_ldw_exc);
len += 0;
goto do3;
case 1:
load4(a2, cda_ldw_exc);
load4(a3, cda_ldw_exc);
len -= 1;
/* A single word: skip the loop and store it at do0. */
if (len == 0)
goto do0;
goto do4; /* No-op. */
}
do
{
/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
/* Each step loads the next source word and stores the merge of the two
 * words loaded before it. */
do4:
load4(a0, cda_ldw_exc);
store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc);
do3:
load4(a1, cda_ldw_exc);
store4(MERGE (a3, sh_1, a0, sh_2), cda_stw_exc);
do2:
load4(a2, cda_ldw_exc);
store4(MERGE (a0, sh_1, a1, sh_2), cda_stw_exc);
do1:
load4(a3, cda_ldw_exc);
store4(MERGE (a1, sh_1, a2, sh_2), cda_stw_exc);
len -= 4;
}
while (len != 0);
do0:
/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc);
/* Keep the two handlers below alive; they are reached only through the
 * asm labels named in the exception-table entries of load4()/store4(). */
preserve_branch(handle_load_error);
preserve_branch(handle_store_error);
return 0;
handle_load_error:
__asm__ __volatile__ ("cda_ldw_exc:\n");
/* Load fault: report how far the destination got (bit 0 clear). */
return dst;
handle_store_error:
__asm__ __volatile__ ("cda_stw_exc:\n");
/* Store fault: same position, tagged by setting bit 0. */
return dst + 1;
}
/*
 * pa_memcpy - copy len bytes from srcp to dstp, recovering from access faults.
 *
 * Loads go through s_space and stores through d_space; in the kernel build
 * the callers program the corresponding space registers before calling.
 *
 * Strategy:
 *  - len < THRESHOLD: byte-at-a-time loop.
 *  - src and dst share the same alignment within a word: byte-copy up to a
 *    word boundary, then 32-byte and 16-byte unrolled word loops, then the
 *    byte loop for the tail.
 *  - otherwise: byte-copy until dst is word-aligned, hand the whole words to
 *    copy_dstaligned(), then the byte loop for the tail.
 *
 * Fault handling relies on asm labels (pmc_done, pmc_byte4/8/12) that the
 * load/store macros register in the exception table; the C labels in front
 * of them are kept alive with preserve_branch().
 *
 * Returns 0 for success, otherwise, returns number of bytes not transferred.
 */
static noinline unsigned long pa_memcpy(void *dstp, const void *srcp,
					unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	unsigned long ret, end;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	/* One past the last destination byte; the fault paths derive the number
	 * of bytes still outstanding from end - dst. */
	end = dst + len;
	/* Empty asm that takes end as an input (presumably so the value is
	 * materialised before any load/store that can fault). */
	asm volatile("": : "r"(end));

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(unsigned int)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are word-aligned. */
	t2 = src & (sizeof(unsigned int) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(unsigned int) - t2;
		while (t2 && len) {
			load1(t3, pmc_done);
			len--;
			store1(t3, pmc_done);
			t2--;
		}
	}

	/*
	 * 32 bytes per iteration, as two groups of "four loads, four stores".
	 * A fault on the 2nd/3rd/4th load of a group happens after src has
	 * moved past words that were loaded but never stored, so those loads
	 * use the pmc_byteN labels, which rewind src by N bytes before
	 * retrying byte-wise. len is only reduced at the end of the
	 * iteration.
	 */
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)src + L1_CACHE_BYTES); */
		load4(r1, pmc_done);
		load4(r2, pmc_byte4);	// bytecopy src-4
		load4(r3, pmc_byte8);	// bytecopy src-8
		load4(r4, pmc_byte12);	// bytecopy src-12
		store4(r1, pmc_done);
		store4(r2, pmc_done);
		store4(r3, pmc_done);
		store4(r4, pmc_done);
		load4(r5, pmc_done);
		load4(r6, pmc_byte4);	// bytecopy src-4
		load4(r7, pmc_byte8);	// bytecopy src-8
		load4(r8, pmc_byte12);	// bytecopy src-12
		store4(r5, pmc_done);
		store4(r6, pmc_done);
		store4(r7, pmc_done);
		store4(r8, pmc_done);
		len -= 8*sizeof(unsigned int);
	}

	/* Same scheme with a single group: 16 bytes per iteration. */
	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		load4(r1, pmc_done);
		load4(r2, pmc_byte4);	// bytecopy src-4
		load4(r3, pmc_byte8);	// bytecopy src-8
		load4(r4, pmc_byte12);	// bytecopy src-12
		store4(r1, pmc_done);
		store4(r2, pmc_done);
		store4(r3, pmc_done);
		store4(r4, pmc_done);
		len -= 4*sizeof(unsigned int);
	}

byte_copy:
	/* Tail, short copies, and byte-wise retry after a load fault. */
	while (len) {
		load1(t3, pmc_done);
		store1(t3, pmc_done);
		len--;
	}

	return 0;

unaligned_copy:
	/*
	 * src and dst differ in alignment. Byte-copy until dst is on a word
	 * boundary; this path is only reached with len >= THRESHOLD, so the
	 * at most 3 bytes copied here cannot exhaust len.
	 */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			load1(t3, pmc_done);
			store1(t3, pmc_done);
			len--;
			t2--;
		}
	}

	/* Shift-and-merge copy of the whole words; src/dst/len are passed by
	 * value and are brought up to date below. */
	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
	if (unlikely(ret)) {
		/* Bit 0 set: store fault at ret - 1. Nothing more can be
		 * written, so report what is left. */
		if (ret & 1)
			return len - (ret - 1 - dst);
		/* Load fault: skip what was stored and retry byte-wise so the
		 * readable bytes in front of the fault are still copied. */
		src += ret - dst;
		len -= ret - dst;
		dst = ret;
		goto byte_copy;
	}

	/* All whole words copied; leave the 0..3 trailing bytes to byte_copy. */
	src += (len & -sizeof(unsigned int));
	dst += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	/* Keep the handlers below alive; they are reached only through the asm
	 * labels named in the exception-table entries. */
	preserve_branch(handle_error1);
	preserve_branch(handle_error2);
	preserve_branch(handle_error3);
	preserve_branch(handle_error4);
	goto byte_copy;

handle_error1:
	__asm__ __volatile__ ("pmc_done:\n");
	/* dst is the first byte that was not stored. */
	return end - dst;

	/*
	 * Load fault inside a four-word group: rewind src to the first word of
	 * the group that has not been stored and retry byte-wise.
	 *
	 * len is recomputed from end - dst because the 32-byte loop reduces len
	 * only once per iteration: on a fault in its second group, len still
	 * includes the 16 bytes stored by the first group. If the byte-wise
	 * retry then does not fault again, a stale len would make byte_copy run
	 * 16 bytes past the end of both buffers.
	 */
handle_error2:
	__asm__ __volatile__ ("pmc_byte4:\n");
	src -= 4;
	len = end - dst;
	goto byte_copy;

handle_error3:
	__asm__ __volatile__ ("pmc_byte8:\n");
	src -= 8;
	len = end - dst;
	goto byte_copy;

handle_error4:
	__asm__ __volatile__ ("pmc_byte12:\n");
	src -= 12;
	len = end - dst;
	goto byte_copy;
}
#ifdef __KERNEL__
/*
 * raw_copy_to_user - copy len bytes from the kernel buffer src to the user
 * buffer dst.
 *
 * Sets the source space register (1) to the kernel space and the destination
 * space register (2) to the user space, then lets pa_memcpy() do the work.
 *
 * Returns pa_memcpy()'s result: 0 on success, otherwise the number of bytes
 * that were not copied.
 */
unsigned long raw_copy_to_user(void __user *dst, const void *src,
			       unsigned long len)
{
	unsigned long uncopied;

	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	uncopied = pa_memcpy((void __force *)dst, src, len);

	return uncopied;
}
EXPORT_SYMBOL(raw_copy_to_user);
/*
 * raw_copy_from_user - copy len bytes from the user buffer src to the kernel
 * buffer dst.
 *
 * Sets the source space register (1) to the user space and the destination
 * space register (2) to the kernel space, then lets pa_memcpy() do the work.
 *
 * Returns pa_memcpy()'s result: 0 on success, otherwise the number of bytes
 * that were not copied.
 */
unsigned long raw_copy_from_user(void *dst, const void __user *src,
				 unsigned long len)
{
	unsigned long uncopied;

	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	uncopied = pa_memcpy(dst, (void __force *)src, len);

	return uncopied;
}
EXPORT_SYMBOL(raw_copy_from_user);
/*
 * raw_copy_in_user - copy len bytes from one user buffer to another.
 *
 * Both the source space register (1) and the destination space register (2)
 * are set to the user space before calling pa_memcpy().
 *
 * Returns pa_memcpy()'s result: 0 on success, otherwise the number of bytes
 * that were not copied.
 */
unsigned long raw_copy_in_user(void __user *dst, const void __user *src,
			       unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}
EXPORT_SYMBOL(raw_copy_in_user);

/*
 * memcpy - copy count bytes from src to dst, both in kernel space.
 *
 * Both space registers are set to the kernel space. The residual count
 * returned by pa_memcpy() is not checked; dst is returned unconditionally.
 */
void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}
EXPORT_SYMBOL(memcpy);
/*
 * probe_kernel_read - read size bytes from src into dst via
 * __probe_kernel_read(), after rejecting low addresses.
 *
 * Returns -EFAULT if src lies below PAGE_SIZE, otherwise whatever
 * __probe_kernel_read() returns.
 */
long probe_kernel_read(void *dst, const void *src, size_t size)
{
	/* Refuse any read that starts inside the first page. */
	if ((unsigned long)src < PAGE_SIZE)
		return -EFAULT;

	/* check for I/O space F_EXTEND(0xfff00000) access as well? */
	return __probe_kernel_read(dst, src, size);
}
#endif