From: Ma, Ling on 30 Jun 2010 05:10

Hi Ingo,

We extracted some comparison results from the attached micro-benchmark on
Core i7 (gcc -O2 -o memcpy-kernel memcpy-kernel.c):

LAT: Len  127, alignment  4/16: improvement: 2X
LAT: Len  127, alignment  0/16: improvement: 2X
LAT: Len 1024, alignment  4/16: improvement: 1.5X
LAT: Len 1024, alignment  0/ 0: no change
LAT: Len 4096, alignment  4/16: improvement: 1.6X
LAT: Len 4096, alignment  0/ 8: improvement: 1.37X
LAT: Len 8192, alignment 16/ 0: no change
LAT: Len 8192, alignment  0/16: improvement: 1.45X

Any comments from you?

Thanks
Ling

> -----Original Message-----
> From: Ma, Ling
> Sent: Tuesday, June 29, 2010 3:24 AM
> To: mingo(a)elte.hu
> Cc: hpa(a)zytor.com; tglx(a)linutronix.de; linux-kernel(a)vger.kernel.org; Ma, Ling
> Subject: [PATCH RFC] [X86] Optimize memcpy by avoiding memory false dependence
>
> From: Ma Ling <ling.ma(a)intel.com>
>
> All read operations after the allocation stage can run speculatively,
> while all write operations run in program order; a read may run before an
> older write if their addresses differ, otherwise it must wait until the
> write commits. However, the CPU does not compare every address bit, so a
> read can fail to recognize a different address even when the two addresses
> are in different pages. For example, if %rsi is 0xf004 and %rdi is 0xe008,
> the following sequence incurs a large latency:
>
> 1. movq (%rsi), %rax
> 2. movq %rax, (%rdi)
> 3. movq 8(%rsi), %rax
> 4. movq %rax, 8(%rdi)
>
> If %rsi and %rdi really were in the same memory page, there would be a
> true read-after-write dependence: instruction 2 writes in-page offset
> 0x008 and instruction 3 reads offset 0x00c, so the two accesses partially
> overlap. Here they are actually in different pages and there is no real
> dependence, but because the CPU does not check every address bit it may
> assume they are in the same page. Instruction 3 then has to wait for
> instruction 2 to move its data from the write buffer into the cache before
> it can load, and the time the read spends is comparable to an mfence
> instruction. We can avoid this by reordering the operations as follows:
>
> 1. movq 8(%rsi), %rax
> 2. movq %rax, 8(%rdi)
> 3. movq (%rsi), %rax
> 4. movq %rax, (%rdi)
>
> Now instruction 3 reads offset 0x004 while instruction 2 writes offset
> 0x010, so there is no dependence at all. With this reordering we gain a
> 1.83x speedup over the original instruction sequence on Core2.
>
> In this patch we first handle small sizes (less than 0x20 = 32 bytes),
> then jump to the appropriate copy mode. Based on our micro-benchmark we
> got up to a 2X improvement for small sizes from 1 to 127 bytes, and up to
> a 1.5X improvement for 1024 bytes, on Core i7. (We used our own
> micro-benchmark and will do further testing according to your
> requirements.)
>
> Thanks
> Ling
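The memcpy-kernel.c attachment is not reproduced on this page. As a rough
stand-in, the user-space sketch below times exactly the two four-instruction
sequences quoted above at the 0x004/0x008 in-page offsets of the example; it
is NOT the original benchmark, and the file name, buffer size, and iteration
counts are invented for illustration (x86-64, gcc -O2 -o false-dep false-dep.c):

/*
 * false-dep.c: minimal sketch of the store-to-load false dependence
 * described in the mail above. Not the memcpy-kernel.c attachment;
 * all sizes and counts here are illustrative assumptions.
 */
#include <stdint.h>
#include <stdio.h>

#define REGION (64 * 1024)
static char buf[256 * 1024] __attribute__((aligned(4096)));

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

/* Sequence 1 from the mail: each load follows a store whose low
 * address bits it partially overlaps, so the load may stall. */
static inline void copy16_in_order(char *dst, const char *src)
{
	__asm__ __volatile__(
		"movq (%1), %%rax\n\t"  "movq %%rax, (%0)\n\t"
		"movq 8(%1), %%rax\n\t" "movq %%rax, 8(%0)"
		:: "r"(dst), "r"(src) : "rax", "memory");
}

/* Sequence 2 from the mail: reordered so no load can alias an
 * earlier store in its low address bits. */
static inline void copy16_reordered(char *dst, const char *src)
{
	__asm__ __volatile__(
		"movq 8(%1), %%rax\n\t" "movq %%rax, 8(%0)\n\t"
		"movq (%1), %%rax\n\t"  "movq %%rax, (%0)"
		:: "r"(dst), "r"(src) : "rax", "memory");
}

int main(void)
{
	char *src = buf + 0xf004;	/* in-page offset 0x004, as in the mail */
	char *dst = buf + 0xe008;	/* in-page offset 0x008 */
	uint64_t t;
	long i, r, reps = 10000;

	t = rdtsc();
	for (r = 0; r < reps; r++)
		for (i = 0; i < REGION; i += 16)
			copy16_in_order(dst + i, src + i);
	printf("in order:  %.2f cycles/16B\n",
	       (double)(rdtsc() - t) / ((double)reps * REGION / 16));

	t = rdtsc();
	for (r = 0; r < reps; r++)
		for (i = 0; i < REGION; i += 16)
			copy16_reordered(dst + i, src + i);
	printf("reordered: %.2f cycles/16B\n",
	       (double)(rdtsc() - t) / ((double)reps * REGION / 16));
	return 0;
}

Walking the copies across a 64 KB region (rather than hammering one address
pair) keeps successive iterations independent, so any gap between the two
numbers comes from the intra-sequence ordering alone.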
> ---
>  arch/x86/lib/memcpy_64.S |  158 ++++++++++++++++++++++++++++++----------------
>  1 files changed, 103 insertions(+), 55 deletions(-)
>
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index f82e884..5902438 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -40,84 +40,132 @@
>  ENTRY(__memcpy)
>  ENTRY(memcpy)
>  	CFI_STARTPROC
> +	movq %rdi, %rax
>
>  	/*
> -	 * Put the number of full 64-byte blocks into %ecx.
> -	 * Tail portion is handled at the end:
> +	 * Use 32-bit CMP here to avoid long NOP padding.
>  	 */
> -	movq %rdi, %rax
> -	movl %edx, %ecx
> -	shrl $6, %ecx
> -	jz .Lhandle_tail
> +	cmp $0x20, %edx
> +	jb .Lhandle_tail
>
> -	.p2align 4
> -.Lloop_64:
>  	/*
> -	 * We decrement the loop index here - and the zero-flag is
> -	 * checked at the end of the loop (instructions inbetween do
> -	 * not change the zero flag):
> +	 * We check whether a memory false dependence could occur,
> +	 * then jump to the corresponding copy mode.
>  	 */
> -	decl %ecx
> +	cmp %dil, %sil
> +	jl .Lcopy_backward
> +	subl $0x20, %edx
> +.Lcopy_forward_loop:
> +	subq $0x20, %rdx
>
>  	/*
> -	 * Move in blocks of 4x16 bytes:
> +	 * Move in blocks of 4x8 bytes:
>  	 */
> -	movq 0*8(%rsi), %r11
> -	movq 1*8(%rsi), %r8
> -	movq %r11, 0*8(%rdi)
> -	movq %r8, 1*8(%rdi)
> -
> -	movq 2*8(%rsi), %r9
> -	movq 3*8(%rsi), %r10
> -	movq %r9, 2*8(%rdi)
> -	movq %r10, 3*8(%rdi)
> -
> -	movq 4*8(%rsi), %r11
> -	movq 5*8(%rsi), %r8
> -	movq %r11, 4*8(%rdi)
> -	movq %r8, 5*8(%rdi)
> -
> -	movq 6*8(%rsi), %r9
> -	movq 7*8(%rsi), %r10
> -	movq %r9, 6*8(%rdi)
> -	movq %r10, 7*8(%rdi)
> -
> -	leaq 64(%rsi), %rsi
> -	leaq 64(%rdi), %rdi
> -
> -	jnz .Lloop_64
> +	movq 0*8(%rsi), %r8
> +	movq 1*8(%rsi), %r9
> +	movq 2*8(%rsi), %r10
> +	movq 3*8(%rsi), %r11
> +	leaq 4*8(%rsi), %rsi
> +
> +	movq %r8, 0*8(%rdi)
> +	movq %r9, 1*8(%rdi)
> +	movq %r10, 2*8(%rdi)
> +	movq %r11, 3*8(%rdi)
> +	leaq 4*8(%rdi), %rdi
> +	jae .Lcopy_forward_loop
> +	addq $0x20, %rdx
> +	jmp .Lhandle_tail
> +
> +.Lcopy_backward:
> +	/*
> +	 * Calculate copy position to tail.
> +	 */
> +	addq %rdx, %rsi
> +	addq %rdx, %rdi
> +	subq $0x20, %rdx
> +	/*
> +	 * At most 3 ALU operations in one cycle,
> +	 * so append NOPs in the same 16-byte chunk.
> +	 */
> +	.p2align 4
> +.Lcopy_backward_loop:
> +	subq $0x20, %rdx
> +	movq -1*8(%rsi), %r8
> +	movq -2*8(%rsi), %r9
> +	movq -3*8(%rsi), %r10
> +	movq -4*8(%rsi), %r11
> +	leaq -4*8(%rsi), %rsi
> +	movq %r8, -1*8(%rdi)
> +	movq %r9, -2*8(%rdi)
> +	movq %r10, -3*8(%rdi)
> +	movq %r11, -4*8(%rdi)
> +	leaq -4*8(%rdi), %rdi
> +	jae .Lcopy_backward_loop
>
> +	/*
> +	 * Calculate copy position to head.
> +	 */
> +	addq $0x20, %rdx
> +	subq %rdx, %rsi
> +	subq %rdx, %rdi
>  .Lhandle_tail:
> -	movl %edx, %ecx
> -	andl $63, %ecx
> -	shrl $3, %ecx
> -	jz .Lhandle_7
> +	cmpq $16, %rdx
> +	jb .Lless_16bytes
>
> +	/*
> +	 * Move data from 16 bytes to 31 bytes.
> +	 */
> +	movq 0*8(%rsi), %r8
> +	movq 1*8(%rsi), %r9
> +	movq -2*8(%rsi, %rdx), %r10
> +	movq -1*8(%rsi, %rdx), %r11
> +	movq %r8, 0*8(%rdi)
> +	movq %r9, 1*8(%rdi)
> +	movq %r10, -2*8(%rdi, %rdx)
> +	movq %r11, -1*8(%rdi, %rdx)
> +	retq
>  	.p2align 4
> -.Lloop_8:
> -	decl %ecx
> -	movq (%rsi), %r8
> -	movq %r8, (%rdi)
> -	leaq 8(%rdi), %rdi
> -	leaq 8(%rsi), %rsi
> -	jnz .Lloop_8
> -
> -.Lhandle_7:
> -	movl %edx, %ecx
> -	andl $7, %ecx
> -	jz .Lend
> +.Lless_16bytes:
> +	cmpq $8, %rdx
> +	jb .Lless_8bytes
> +	/*
> +	 * Move data from 8 bytes to 15 bytes.
> +	 */
> +	movq 0*8(%rsi), %r8
> +	movq -1*8(%rsi, %rdx), %r9
> +	movq %r8, 0*8(%rdi)
> +	movq %r9, -1*8(%rdi, %rdx)
> +	retq
> +	.p2align 4
> +.Lless_8bytes:
> +	cmpq $4, %rdx
> +	jb .Lless_3bytes
>
> +	/*
> +	 * Move data from 4 bytes to 7 bytes.
> +	 */
> +	movl (%rsi), %ecx
> +	movl -4(%rsi, %rdx), %r8d
> +	movl %ecx, (%rdi)
> +	movl %r8d, -4(%rdi, %rdx)
> +	retq
>  	.p2align 4
> +.Lless_3bytes:
> +	cmpl $0, %edx
> +	je .Lend
> +	/*
> +	 * Move data from 1 byte to 3 bytes.
> +	 */
>  .Lloop_1:
>  	movb (%rsi), %r8b
>  	movb %r8b, (%rdi)
>  	incq %rdi
>  	incq %rsi
> -	decl %ecx
> +	decl %edx
>  	jnz .Lloop_1
>
>  .Lend:
> -	ret
> +	retq
>  	CFI_ENDPROC
>  ENDPROC(memcpy)
>  ENDPROC(__memcpy)
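The .Lhandle_tail path above replaces the old 8-byte and 1-byte loops with
pairs of potentially overlapping windows: one load anchored at the head and
one ending exactly at the tail, with all loads issued before any store. A
C-level sketch of the same idea for the sub-16-byte cases (memcpy_small is
an illustrative name, not a kernel interface; memcpy is used only as a
type-safe unaligned load/store in this user-space rendering):

/*
 * Sketch of the overlapping-window tail strategy; illustrative only.
 */
#include <stdint.h>
#include <string.h>

static void memcpy_small(void *dst, const void *src, size_t len)
{
	const char *s = src;
	char *d = dst;

	if (len >= 8) {			/* 8..15 bytes: two 8-byte windows */
		uint64_t head, tail;
		memcpy(&head, s, 8);		/* movq  0*8(%rsi), %r8       */
		memcpy(&tail, s + len - 8, 8);	/* movq -1*8(%rsi,%rdx), %r9  */
		memcpy(d, &head, 8);
		memcpy(d + len - 8, &tail, 8);
	} else if (len >= 4) {		/* 4..7 bytes: two 4-byte windows  */
		uint32_t head, tail;
		memcpy(&head, s, 4);
		memcpy(&tail, s + len - 4, 4);
		memcpy(d, &head, 4);
		memcpy(d + len - 4, &tail, 4);
	} else {			/* 1..3 bytes: plain byte loop     */
		while (len--)
			*d++ = *s++;
	}
}

Issuing both loads before either store mirrors the assembly and keeps the
copy correct even when the two windows overlap in the middle of the range.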
> --
> 1.6.5.2
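One subtle point in the patch is the direction test "cmp %dil, %sil; jl
.Lcopy_backward": it compares only the low 8 bits of the two pointers,
presumably as a cheap heuristic for "source sits just below destination",
where a forward copy would keep re-triggering the false dependence (and
would also corrupt data if the buffers genuinely overlap). Roughly, in C
(copy_blocks is an illustrative name; the byte loops stand in for the
32-byte unrolled loops):

/*
 * Sketch of the low-byte direction heuristic; illustrative only.
 */
#include <stddef.h>
#include <stdint.h>

static void copy_blocks(char *dst, const char *src, size_t len)
{
	size_t i;

	/* jl is a signed 8-bit compare, hence the int8_t casts. */
	if ((int8_t)(uintptr_t)src < (int8_t)(uintptr_t)dst) {
		/* backward, as in .Lcopy_backward: start from the tail */
		for (i = len; i > 0; i--)
			dst[i - 1] = src[i - 1];
	} else {
		/* forward, as in .Lcopy_forward_loop */
		for (i = 0; i < len; i++)
			dst[i] = src[i];
	}
}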