From fdf4289679fd41d76553ce224750e9737cd80eea Mon Sep 17 00:00:00 2001 From: "Ma, Ling" Date: Mon, 23 Aug 2010 14:11:12 -0700 Subject: x86, mem: Don't implement forward memmove() as memcpy() memmove() allow source and destination address to be overlap, but there is no such limitation for memcpy(). Therefore, explicitly implement memmove() in both the forwards and backward directions, to give us the ability to optimize memcpy(). Signed-off-by: Ma Ling LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86/lib/memcpy_32.c') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 5415a9d06f53..be424dfcf365 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -25,19 +25,35 @@ void *memmove(void *dest, const void *src, size_t n) int d0, d1, d2; if (dest < src) { - memcpy(dest, src, n); + if ((dest + n) < src) + return memcpy(dest, src, n); + else + __asm__ __volatile__( + "rep\n\t" + "movsb\n\t" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + } else { - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); + + if((src + count) < dest) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "std\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (n-1+src), + "2" (n-1+dest) + :"memory"); } + return dest; } EXPORT_SYMBOL(memmove); -- cgit v1.2.3 From 59daa706fbec745684702741b9f5373142dd9fdc Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Tue, 29 Jun 2010 03:24:25 +0800 Subject: x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/lib/memcpy_32.c') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index be424dfcf365..81130d477ee2 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n) "1" (src), "2" (dest) :"memory"); - } else { - - if((src + count) < dest) - return memcpy(dest, src, count); + if((src + n) < dest) + return memcpy(dest, src, n); else __asm__ __volatile__( "std\n\t" -- cgit v1.2.3 From 3b4b682becdfa9f42321aa024d5cc84f71f06d8c Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Fri, 17 Sep 2010 03:12:40 +0800 Subject: x86, mem: Optimize memmove for small size and unaligned cases movs instruction will combine data to accelerate moving data, however we need to concern two cases about it. 1. movs instruction need long lantency to startup, so here we use general mov instruction to copy data. 2. movs instruction is not good for unaligned case, even if src offset is 0x10, dest offset is 0x0, we avoid and handle the case by general mov instruction. Signed-off-by: Ma Ling LKML-Reference: <1284664360-6138-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 213 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 182 insertions(+), 31 deletions(-) (limited to 'arch/x86/lib/memcpy_32.c') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 81130d477ee2..b908a59eccf5 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -22,36 +22,187 @@ EXPORT_SYMBOL(memset); void *memmove(void *dest, const void *src, size_t n) { - int d0, d1, d2; - - if (dest < src) { - if ((dest + n) < src) - return memcpy(dest, src, n); - else - __asm__ __volatile__( - "rep\n\t" - "movsb\n\t" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (src), - "2" (dest) - :"memory"); - } else { - if((src + n) < dest) - return memcpy(dest, src, n); - else - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); - } - - return dest; + int d0,d1,d2,d3,d4,d5; + char *ret = dest; + + __asm__ __volatile__( + /* Handle more 16bytes in loop */ + "cmp $0x10, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movs instruction is only good for aligned case. + */ + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 4f\n\t" + "3:\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts forward in each loop. + */ + "3:\n\t" + "sub $0x10, %0\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov 2*4(%1), %3\n\t" + "mov 3*4(%1), %4\n\t" + "mov %3, 2*4(%2)\n\t" + "mov %4, 3*4(%2)\n\t" + "lea 0x10(%1), %1\n\t" + "lea 0x10(%2), %2\n\t" + "jae 3b\n\t" + "add $0x10, %0\n\t" + "jmp 1f\n\t" + + /* + * Handle data forward by movs. + */ + ".p2align 4\n\t" + "4:\n\t" + "mov -4(%1, %0), %3\n\t" + "lea -4(%2, %0), %4\n\t" + "shr $2, %0\n\t" + "rep movsl\n\t" + "mov %3, (%4)\n\t" + "jmp 11f\n\t" + /* + * Handle data backward by movs. + */ + ".p2align 4\n\t" + "6:\n\t" + "mov (%1), %3\n\t" + "mov %2, %4\n\t" + "lea -4(%1, %0), %1\n\t" + "lea -4(%2, %0), %2\n\t" + "shr $2, %0\n\t" + "std\n\t" + "rep movsl\n\t" + "mov %3,(%4)\n\t" + "cld\n\t" + "jmp 11f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 5f\n\t" + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 6b\n\t" + + /* + * Calculate copy position to tail. + */ + "5:\n\t" + "add %0, %1\n\t" + "add %0, %2\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts backward in each loop. + */ + "7:\n\t" + "sub $0x10, %0\n\t" + + "mov -1*4(%1), %3\n\t" + "mov -2*4(%1), %4\n\t" + "mov %3, -1*4(%2)\n\t" + "mov %4, -2*4(%2)\n\t" + "mov -3*4(%1), %3\n\t" + "mov -4*4(%1), %4\n\t" + "mov %3, -3*4(%2)\n\t" + "mov %4, -4*4(%2)\n\t" + "lea -0x10(%1), %1\n\t" + "lea -0x10(%2), %2\n\t" + "jae 7b\n\t" + /* + * Calculate copy position to head. + */ + "add $0x10, %0\n\t" + "sub %0, %1\n\t" + "sub %0, %2\n\t" + + /* + * Move data from 8 bytes to 15 bytes. + */ + ".p2align 4\n\t" + "1:\n\t" + "cmp $8, %0\n\t" + "jb 8f\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov -2*4(%1, %0), %5\n\t" + "mov -1*4(%1, %0), %1\n\t" + + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov %5, -2*4(%2, %0)\n\t" + "mov %1, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 4 bytes to 7 bytes. + */ + ".p2align 4\n\t" + "8:\n\t" + "cmp $4, %0\n\t" + "jb 9f\n\t" + "mov 0*4(%1), %3\n\t" + "mov -1*4(%1, %0), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 2 bytes to 3 bytes. + */ + ".p2align 4\n\t" + "9:\n\t" + "cmp $2, %0\n\t" + "jb 10f\n\t" + "movw 0*2(%1), %%dx\n\t" + "movw -1*2(%1, %0), %%bx\n\t" + "movw %%dx, 0*2(%2)\n\t" + "movw %%bx, -1*2(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data for 1 byte. + */ + ".p2align 4\n\t" + "10:\n\t" + "cmp $1, %0\n\t" + "jb 11f\n\t" + "movb (%1), %%cl\n\t" + "movb %%cl, (%2)\n\t" + ".p2align 4\n\t" + "11:" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), + "=r" (d3),"=r" (d4), "=r"(d5) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + + return ret; + } EXPORT_SYMBOL(memmove); -- cgit v1.2.3