X-Git-Url: https://git.gag.com/?a=blobdiff_plain;f=gnuradio-core%2Fsrc%2Flib%2Ffilter%2Fcomplex_dotprod_sse64.S;h=2e945654722f70fc205ff6a519a006c336dcdc39;hb=refs%2Ftags%2Fupstream%2F3.2.2;hp=acbff61aef570fd457721d14ecdf9b1868bc7d6f;hpb=09a1e803a9e6587c78d20cdf16891e5295874668;p=debian%2Fgnuradio diff --git a/gnuradio-core/src/lib/filter/complex_dotprod_sse64.S b/gnuradio-core/src/lib/filter/complex_dotprod_sse64.S index acbff61a..2e945654 100644 --- a/gnuradio-core/src/lib/filter/complex_dotprod_sse64.S +++ b/gnuradio-core/src/lib/filter/complex_dotprod_sse64.S @@ -80,7 +80,7 @@ GLOB_SYMB(complex_dotprod_sse): jmp .L1_test .p2align 4 -.loop1: +.Loop1: pxor %mm0, %mm0 punpcklwd 0(%rdi), %mm0 @@ -94,7 +94,7 @@ GLOB_SYMB(complex_dotprod_sse): addps %xmm0, %xmm4 .L1_test: dec %rax - jge .loop1 + jge .Loop1 # set up for primary loop which is unrolled 4 times @@ -103,7 +103,7 @@ GLOB_SYMB(complex_dotprod_sse): shr $2, %rdx # n_2_complex_blocks / 4 movaps %xmm5, %xmm7 - je .cleanup # if zero, take short path + je .Lcleanup # if zero, take short path # finish setup and loop priming @@ -127,7 +127,7 @@ GLOB_SYMB(complex_dotprod_sse): # hence enter loop at top .p2align 4 -.loop2: +.Loop2: mulps (%rsi), %xmm0 addps %xmm2, %xmm6 @@ -167,7 +167,7 @@ GLOB_SYMB(complex_dotprod_sse): add $0x40, %rsi add $0x10, %rdi dec %rdx - jne .loop2 + jne .Loop2 # OK, now we've done with all the multiplies, but # we still need to handle the unaccumulated @@ -186,7 +186,7 @@ GLOB_SYMB(complex_dotprod_sse): # At this point, xmm4 contains 2x2 partial sums. We need # to compute a "horizontal complex add" across xmm4. -.cleanup: # xmm4 = r1 i2 r3 i4 +.Lcleanup: # xmm4 = r1 i2 r3 i4 movhlps %xmm4, %xmm0 # xmm0 = ?? ?? r1 r2 addps %xmm4, %xmm0 # xmm0 = ?? ?? r1+r3 i2+i4 movlps %xmm0, (%rcx) # store low 2x32 bits (complex) to memory @@ -196,3 +196,7 @@ GLOB_SYMB(complex_dotprod_sse): FUNC_TAIL(complex_dotprod_sse) .ident "Hand coded x86_64 SSE assembly" + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif