# # Copyright 2002,2005 Free Software Foundation, Inc. # # This file is part of GNU Radio # # GNU Radio is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # GNU Radio is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with GNU Radio; see the file COPYING. If not, write to # the Free Software Foundation, Inc., 51 Franklin Street, # Boston, MA 02110-1301, USA. # # input and taps are guarenteed to be 16 byte aligned. # n_4_float_blocks is != 0 # # # float # float_dotprod_generic (const float *input, # const float *taps, unsigned n_4_float_blocks) # { # float sum0 = 0; # float sum1 = 0; # float sum2 = 0; # float sum3 = 0; # # do { # # sum0 += input[0] * taps[0]; # sum1 += input[1] * taps[1]; # sum2 += input[2] * taps[2]; # sum3 += input[3] * taps[3]; # # input += 4; # taps += 4; # # } while (--n_4_float_blocks != 0); # # # return sum0 + sum1 + sum2 + sum3; # } # #include "assembly.h" .file "float_dotprod_sse64.S" .version "01.01" .text .p2align 4 .globl GLOB_SYMB(float_dotprod_sse) DEF_FUNC_HEAD(float_dotprod_sse) GLOB_SYMB(float_dotprod_sse): # intput: rdi, taps: rsi, n_2_ccomplex_blocks: rdx mov %rdx, %rax # xmm0 xmm1 xmm2 xmm3 are used to hold taps and the result of mults # xmm4 xmm5 xmm6 xmm7 are used to hold the accumulated results xorps %xmm4, %xmm4 # zero two accumulators xorps %xmm5, %xmm5 # xmm5 holds zero for use below # first handle any non-zero remainder of (n_4_float_blocks % 4) and $0x3, %rax jmp .L1_test .p2align 4 .Loop1: movaps (%rsi), %xmm0 mulps (%rdi), %xmm0 add $0x10, %rdi add $0x10, %rsi addps %xmm0, %xmm4 .L1_test: dec %rax jge .Loop1 # set up for primary loop which is unrolled 4 times movaps %xmm5, %xmm6 # zero remaining accumulators movaps %xmm5, %xmm7 shr $2, %rdx # n_4_float_blocks / 4 je .Lcleanup # if zero, take short path # finish setup and loop priming movaps 0x00(%rsi), %xmm0 movaps %xmm5, %xmm2 movaps 0x10(%rsi), %xmm1 movaps %xmm5, %xmm3 # we know rdx is not zero, we checked above, # hence enter loop at top .p2align 4 .Loop2: mulps (%rdi), %xmm0 addps %xmm2, %xmm6 movaps 0x20(%rsi), %xmm2 mulps 0x10(%rdi), %xmm1 addps %xmm3, %xmm7 movaps 0x30(%rsi), %xmm3 mulps 0x20(%rdi), %xmm2 addps %xmm0, %xmm4 movaps 0x40(%rsi), %xmm0 mulps 0x30(%rdi), %xmm3 addps %xmm1, %xmm5 movaps 0x50(%rsi), %xmm1 add $0x40, %rdi add $0x40, %rsi dec %rdx jne .Loop2 # OK, now we've done with all the multiplies, but # we still need to handle the unaccumulated # products in xmm2 and xmm3 addps %xmm2, %xmm6 addps %xmm3, %xmm7 # now we want to add all accumulators into xmm4 addps %xmm5, %xmm4 addps %xmm6, %xmm7 addps %xmm7, %xmm4 # At this point, xmm4 contains 4 partial sums. We need # to compute a "horizontal add" across xmm4. # This is a fairly nasty operation... .Lcleanup: # xmm4 = d1 d2 d3 d4 xorps %xmm0, %xmm0 # xmm0 = 0 0 0 0 (may be unnecessary) movhlps %xmm4, %xmm0 # xmm0 = 0 0 d1 d2 addps %xmm4, %xmm0 # xmm0 = d1 d2 d1+d3 d2+d4 movaps %xmm0, %xmm1 # xmm1 = d1 d2 d1+d3 d2+d4 shufps $0xE1, %xmm4, %xmm1 # xmm1 = d1 d2 d2+d4 d1+d3 addss %xmm1, %xmm0 # xmm1 = d1 d2 d1+d3 d1+d2+d3+d4 retq FUNC_TAIL(float_dotprod_sse) .ident "Hand coded x86_64 SSE assembly" #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif