| /* |
| * Optimized memory copy routines. |
| * |
| * Copyright (C) 2004 Randolph Chung <tausq@debian.org> |
| * Copyright (C) 2013 Helge Deller <deller@gmx.de> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2, or (at your option) |
| * any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| * |
| * Portions derived from the GNU C Library |
| * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc. |
| * |
| * Several strategies are used to get the best performance for various |
| * conditions. The original design copied 64 bytes per iteration in an unrolled |
| * loop using fp regs; the code in this file uses unrolled loops that copy 32 or |
| * 16 bytes at a time using general registers. Unaligned copies are handled |
| * either by aligning the destination and then using a shift-and-write method, |
| * or in a few cases by falling back to a byte-at-a-time copy. |
| * |
| * I chose to implement this in C because it is easier to maintain and debug, |
| * and in my experiments it appears that the C code generated by gcc (3.3/3.4 |
| * at the time of writing) is fairly optimal. Unfortunately some of the |
| * semantics of the copy routine (exception handling) is difficult to express |
| * in C, so we have to play some tricks to get it to work. |
| * |
| * All the loads and stores are done via explicit asm() code in order to use |
| * the right space registers. |
| * |
| * Testing with various alignments and buffer sizes shows that this code is |
| * often >10x faster than a simple byte-at-a-time copy, even for strangely |
| * aligned operands. It is interesting to note that the glibc version |
| * of memcpy (written in C) is actually quite fast already. This routine is |
| * able to beat it by 30-40% for aligned copies because of the loop unrolling, |
| * but in some cases the glibc version is still slightly faster. This lends |
| * more credibility that gcc can generate very good code as long as we are |
| * careful. |
| * |
| * TODO: |
| * - cache prefetching needs more experimentation to get optimal settings |
| * - try not to use the post-increment address modifiers; they create additional |
| * interlocks |
| * - replace byte-copy loops with stybs sequences |
| */ |
| |
| #ifdef __KERNEL__ |
| #include <linux/module.h> |
| #include <linux/compiler.h> |
| #include <linux/uaccess.h> |
| #define s_space "%%sr1" |
| #define d_space "%%sr2" |
| #else |
| #include "memcpy.h" |
| #define s_space "%%sr0" |
| #define d_space "%%sr0" |
| #define pa_memcpy new2_copy |
| #endif |
| |
| DECLARE_PER_CPU(struct exception_data, exception_data); |
| |
/*
 * preserve_branch(label): emit a conditional "goto label" whose condition
 * compares a volatile local with itself.  It gives the compiler a visible
 * C-level path to 'label', so the fault-handler code placed there is kept
 * even though it is otherwise only entered through the exception table.
 */
| #define preserve_branch(label) do { \ |
| volatile int dummy = 0; \ |
| /* The following branch is never taken, it's just here to */ \ |
| /* prevent gcc from optimizing away our exception code. */ \ |
| if (unlikely(dummy != dummy)) \ |
| goto label; \ |
| } while (0) |
| |
/*
 * Space identifiers handed to mtsp() by the copy wrappers further down.
 * get_user_space() yields 0 when uaccess_kernel() is true and otherwise the
 * value returned by mfsp(3) (presumably the contents of space register 3 --
 * confirm against the mfsp() definition).  get_kernel_space() is always 0.
 */
| #define get_user_space() (uaccess_kernel() ? 0 : mfsp(3)) |
| #define get_kernel_space() (0) |
| |
/*
 * MERGE(w0, sh_1, w1, sh_2): build one destination word out of two adjacent
 * source words.  sh_2 is moved into the shift-amount register with mtsar and
 * shrpw then combines w0 and w1 using %sar, leaving the result in _r.
 * Note that sh_1 is accepted for symmetry with the glibc original but is not
 * referenced by the asm.
 * NOTE(review): the asm writes %sar without listing it as a clobber --
 * confirm this is safe with the compiler in use.
 */
| #define MERGE(w0, sh_1, w1, sh_2) ({ \ |
| unsigned int _r; \ |
| asm volatile ( \ |
| "mtsar %3\n" \ |
| "shrpw %1, %2, %%sar, %0\n" \ |
| : "=r"(_r) \ |
| : "r"(w0), "r"(w1), "r"(sh_2) \ |
| ); \ |
| _r; \ |
| }) |
/* Copies shorter than THRESHOLD bytes go straight to pa_memcpy()'s byte loop. */
| #define THRESHOLD 16 |
| |
/*
 * DPRINTF(): debug tracing.  With DEBUG_MEMCPY defined it issues two
 * printk(KERN_DEBUG ...) calls: first a "file:line:function" prefix, then the
 * caller's format string and arguments.  Without DEBUG_MEMCPY it expands to
 * nothing, so its arguments are not evaluated.
 */
| #ifdef DEBUG_MEMCPY |
| #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0) |
| #else |
| #define DPRINTF(fmt, args...) |
| #endif |
| |
/*
 * def_load_ai_insn / def_store_ai_insn: emit a single load or store
 * instruction, labelled "1:", that uses the ",ma" completer (the
 * post-increment address modifier mentioned in the file header).
 *
 *   _insn  instruction mnemonic (ldbs, ldw, stbs, stw below)
 *   _sz    displacement, i.e. the access size in bytes
 *   _tt    asm constraint for the data operand ("=r" for loads, "r" for stores)
 *   _s     space register string (s_space or d_space)
 *   _a     address variable; an in/out ("+r") operand, so it is updated
 *   _t     data variable (load target or value to store)
 *   _e     asm label handed to ASM_EXCEPTIONTABLE_ENTRY(1b, _e); presumably
 *          the fixup address jumped to when the access at "1:" faults --
 *          confirm against the ASM_EXCEPTIONTABLE_ENTRY definition
 *
 * Both templates list r8 as clobbered.
 */
| #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ |
| __asm__ __volatile__ ( \ |
| "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \ |
| ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ |
| : _tt(_t), "+r"(_a) \ |
| : \ |
| : "r8") |
| |
| #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ |
| __asm__ __volatile__ ( \ |
| "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \ |
| ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ |
| : "+r"(_a) \ |
| : _tt(_t) \ |
| : "r8") |
| |
/*
 * Byte and word accessors used by the copy routines.  They hard-code the
 * variable names: loads read through s_space using a variable called 'src',
 * stores write through d_space using a variable called 'dst', so both must
 * exist in the calling function.
 */
| #define load1(_t, _e) def_load_ai_insn(ldbs,1,"=r",s_space,src,_t,_e) |
| #define store1(_t, _e) def_store_ai_insn(stbs,1,"r",d_space,dst,_t,_e) |
| #define load4(_t, _e) def_load_ai_insn(ldw,4,"=r",s_space,src,_t,_e) |
| #define store4(_t, _e) def_store_ai_insn(stw,4,"r",d_space,dst,_t,_e) |
| |
/*
 * Prefetch helpers.  With CONFIG_PREFETCH they issue a load from 'addr' with
 * %r0 as the target register: ldw through s_space for the source, ldd through
 * d_space for the destination.  Without CONFIG_PREFETCH both are empty
 * statements.  All call sites visible in this file are commented out.
 */
| #ifdef CONFIG_PREFETCH |
| static inline void prefetch_src(const void *addr) |
| { |
| __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr)); |
| } |
| |
| static inline void prefetch_dst(const void *addr) |
| { |
| __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr)); |
| } |
| #else |
| #define prefetch_src(addr) do { } while(0) |
| #define prefetch_dst(addr) do { } while(0) |
| #endif |
| |
| /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words |
| * per loop. This code is derived from glibc. |
| * |
| * dst: destination address; every store is a word store via store4() |
| * src: source address; it is rounded down to a word boundary below and the |
| * byte offset that was dropped determines the shift counts |
| * len: number of WORDS to copy (the caller passes its byte count divided by |
| * sizeof(unsigned int)) |
| * |
| * load4()/store4() operate on the local 'src'/'dst' and advance them by one |
| * word per access. Two words are preloaded, then every store merges two |
| * neighbouring source words with MERGE(); for a count of len words this |
| * performs len + 1 word loads and len word stores. The switch on len % 4 |
| * picks the entry point into the 4-way unrolled loop and biases len so that |
| * the loop counter reaches exactly 0. |
| * |
| * cda_ldw_exc and cda_stw_exc are asm labels named as the fault targets of |
| * the load4()/store4() accesses; the code at those labels returns the value |
| * of dst at that point. The preserve_branch() calls keep that code from |
| * being discarded by the compiler. |
| * |
| * Return 0 on success, |
| * last address stored on load fault |
| * last address stored + 1 on store fault |
| * (dst is word-aligned, so the caller tells the two apart by bit 0) |
| * |
| */ |
| static noinline unsigned long copy_dstaligned(unsigned long dst, |
| unsigned long src, unsigned long len) |
| { |
| /* gcc complains that a2 and a3 may be uninitialized, but actually |
| * they cannot be. Initialize a2/a3 to shut gcc up. |
| */ |
| register unsigned int a0, a1, a2 = 0, a3 = 0; |
| int sh_1, sh_2; |
| |
| /* prefetch_src((const void *)src); */ |
| |
| /* Calculate how to shift a word read at the memory operation |
| aligned srcp to make it aligned for copy. */ |
| sh_1 = 8 * (src % sizeof(unsigned int)); |
| sh_2 = 8 * sizeof(unsigned int) - sh_1; |
| |
| /* Make src aligned by rounding it down. */ |
| src &= -sizeof(unsigned int); |
| |
| switch (len % 4) |
| { |
| case 2: |
| load4(a1, cda_ldw_exc); |
| load4(a2, cda_ldw_exc); |
| len += 2; |
| goto do1; |
| case 3: |
| load4(a0, cda_ldw_exc); |
| load4(a1, cda_ldw_exc); |
| len += 1; |
| goto do2; |
| case 0: |
| if (len == 0) |
| return 0; |
| load4(a3, cda_ldw_exc); |
| load4(a0, cda_ldw_exc); |
| len += 0; |
| goto do3; |
| case 1: |
| load4(a2, cda_ldw_exc); |
| load4(a3, cda_ldw_exc); |
| len -= 1; |
| if (len == 0) |
| goto do0; |
| goto do4; /* No-op. */ |
| } |
| |
| do |
| { |
| /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */ |
| do4: |
| load4(a0, cda_ldw_exc); |
| store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc); |
| do3: |
| load4(a1, cda_ldw_exc); |
| store4(MERGE (a3, sh_1, a0, sh_2), cda_stw_exc); |
| do2: |
| load4(a2, cda_ldw_exc); |
| store4(MERGE (a0, sh_1, a1, sh_2), cda_stw_exc); |
| do1: |
| load4(a3, cda_ldw_exc); |
| store4(MERGE (a1, sh_1, a2, sh_2), cda_stw_exc); |
| |
| len -= 4; |
| } |
| while (len != 0); |
| |
| do0: |
| /* Final word, i.e. ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ |
| store4(MERGE (a2, sh_1, a3, sh_2), cda_stw_exc); |
| |
| preserve_branch(handle_load_error); |
| preserve_branch(handle_store_error); |
| |
| return 0; |
| |
| handle_load_error: |
| __asm__ __volatile__ ("cda_ldw_exc:\n"); |
| return dst; |
| |
| handle_store_error: |
| __asm__ __volatile__ ("cda_stw_exc:\n"); |
| return dst + 1; |
| } |
| |
| |
| /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ |
| static noinline unsigned long pa_memcpy(void *dstp, const void *srcp, |
| unsigned long len) |
| { |
| register unsigned long src, dst, t1, t2, t3; |
| unsigned long ret, end; |
| |
| src = (unsigned long)srcp; |
| dst = (unsigned long)dstp; |
| end = dst + len; |
| asm volatile("": : "r"(end)); |
| |
| /* prefetch_src((const void *)srcp); */ |
| |
| if (len < THRESHOLD) |
| goto byte_copy; |
| |
| /* Check alignment */ |
| t1 = (src ^ dst); |
| if (unlikely(t1 & (sizeof(unsigned int)-1))) |
| goto unaligned_copy; |
| |
| /* src and dst have same alignment. */ |
| |
| /* Copy bytes till we are double-aligned. */ |
| t2 = src & (sizeof(unsigned int) - 1); |
| if (unlikely(t2 != 0)) { |
| t2 = sizeof(unsigned int) - t2; |
| while (t2 && len) { |
| load1(t3, pmc_done); |
| len--; |
| store1(t3, pmc_done); |
| t2--; |
| } |
| } |
| |
| while (len >= 8*sizeof(unsigned int)) { |
| register unsigned int r1,r2,r3,r4,r5,r6,r7,r8; |
| /* prefetch_src((char *)src + L1_CACHE_BYTES); */ |
| load4(r1, pmc_done); |
| load4(r2, pmc_byte4); // bytecopy src-4 |
| load4(r3, pmc_byte8); // bytecopy src-8 |
| load4(r4, pmc_byte12); // bytecopy src-12 |
| store4(r1, pmc_done); |
| store4(r2, pmc_done); |
| store4(r3, pmc_done); |
| store4(r4, pmc_done); |
| |
| load4(r5, pmc_done); |
| load4(r6, pmc_byte4); // bytecopy src-4 |
| load4(r7, pmc_byte8); // bytecopy src-8 |
| load4(r8, pmc_byte12); // bytecopy src-12 |
| store4(r5, pmc_done); |
| store4(r6, pmc_done); |
| store4(r7, pmc_done); |
| store4(r8, pmc_done); |
| len -= 8*sizeof(unsigned int); |
| } |
| |
| while (len >= 4*sizeof(unsigned int)) { |
| register unsigned int r1,r2,r3,r4; |
| load4(r1, pmc_done); |
| load4(r2, pmc_byte4); // bytecopy src-4 |
| load4(r3, pmc_byte8); // bytecopy src-8 |
| load4(r4, pmc_byte12); // bytecopy src-12 |
| store4(r1, pmc_done); |
| store4(r2, pmc_done); |
| store4(r3, pmc_done); |
| store4(r4, pmc_done); |
| len -= 4*sizeof(unsigned int); |
| } |
| |
| byte_copy: |
| while (len) { |
| load1(t3, pmc_done); |
| store1(t3, pmc_done); |
| len--; |
| } |
| |
| return 0; |
| |
| unaligned_copy: |
| if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) { |
| t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1)); |
| while (t2) { |
| load1(t3, pmc_done); |
| store1(t3, pmc_done); |
| len--; |
| t2--; |
| } |
| } |
| |
| ret = copy_dstaligned(dst, src, len / sizeof(unsigned int)); |
| if (unlikely(ret)) { |
| if (ret & 1) |
| return len - (ret - 1 - dst); |
| src += ret - dst; |
| len -= ret - dst; |
| dst = ret; |
| goto byte_copy; |
| } |
| |
| src += (len & -sizeof(unsigned int)); |
| dst += (len & -sizeof(unsigned int)); |
| len %= sizeof(unsigned int); |
| |
| preserve_branch(handle_error1); |
| preserve_branch(handle_error2); |
| preserve_branch(handle_error3); |
| preserve_branch(handle_error4); |
| goto byte_copy; |
| |
| handle_error1: |
| __asm__ __volatile__ ("pmc_done:\n"); |
| return end - dst; |
| handle_error2: |
| __asm__ __volatile__ ("pmc_byte4:\n"); |
| src -= 4; |
| goto byte_copy; |
| handle_error3: |
| __asm__ __volatile__ ("pmc_byte8:\n"); |
| src -= 8; |
| goto byte_copy; |
| handle_error4: |
| __asm__ __volatile__ ("pmc_byte12:\n"); |
| src -= 12; |
| goto byte_copy; |
| } |
| |
| #ifdef __KERNEL__ |
| unsigned long raw_copy_to_user(void __user *dst, const void *src, |
| unsigned long len) |
| { |
| mtsp(get_kernel_space(), 1); |
| mtsp(get_user_space(), 2); |
| return pa_memcpy((void __force *)dst, src, len); |
| } |
| EXPORT_SYMBOL(raw_copy_to_user); |
| |
| unsigned long raw_copy_from_user(void *dst, const void __user *src, |
| unsigned long len) |
| { |
| mtsp(get_user_space(), 1); |
| mtsp(get_kernel_space(), 2); |
| return pa_memcpy(dst, (void __force *)src, len); |
| } |
| EXPORT_SYMBOL(raw_copy_from_user); |
| |
| unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len) |
| { |
| mtsp(get_user_space(), 1); |
| mtsp(get_user_space(), 2); |
| return pa_memcpy((void __force *)dst, (void __force *)src, len); |
| } |
| |
| |
| void * memcpy(void * dst,const void *src, size_t count) |
| { |
| mtsp(get_kernel_space(), 1); |
| mtsp(get_kernel_space(), 2); |
| pa_memcpy(dst, src, count); |
| return dst; |
| } |
| |
| EXPORT_SYMBOL(raw_copy_in_user); |
| EXPORT_SYMBOL(memcpy); |
| |
| long probe_kernel_read(void *dst, const void *src, size_t size) |
| { |
| unsigned long addr = (unsigned long)src; |
| |
| if (addr < PAGE_SIZE) |
| return -EFAULT; |
| |
| /* check for I/O space F_EXTEND(0xfff00000) access as well? */ |
| |
| return __probe_kernel_read(dst, src, size); |
| } |
| |
| #endif |