2 # Copyright 2002 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
23 # input and taps are guarenteed to be 16 byte aligned.
24 # n_4_float_blocks is != 0
28 # float_dotprod_generic (const float *input,
29 # const float *taps, unsigned n_4_float_blocks)
38 # sum0 += input[0] * taps[0];
39 # sum1 += input[1] * taps[1];
40 # sum2 += input[2] * taps[2];
41 # sum3 += input[3] * taps[3];
46 # } while (--n_4_float_blocks != 0);
49 # return sum0 + sum1 + sum2 + sum3;
56 .file "float_dotprod_sse.S"
60 .globl GLOB_SYMB(float_dotprod_sse)
61 DEF_FUNC_HEAD(float_dotprod_sse)
62 GLOB_SYMB(float_dotprod_sse):
70 # xmm0 xmm1 xmm2 xmm3 are used to hold taps and the result of mults
71 # xmm4 xmm5 xmm6 xmm7 are used to hold the accumulated results
73 xorps %xmm4, %xmm4 # zero two accumulators
74 xorps %xmm5, %xmm5 # xmm5 holds zero for use below
76 # first handle any non-zero remainder of (n_4_float_blocks % 4)
93 # set up for primary loop which is unrolled 4 times
96 movaps %xmm5, %xmm6 # zero remaining accumulators
99 shrl $2, %ecx # n_4_float_blocks / 4
100 je .cleanup # if zero, take short path
102 # finish setup and loop priming
104 movaps 0x00(%eax), %xmm0
106 movaps 0x10(%eax), %xmm1
109 # we know ecx is not zero, we checked above,
110 # hence enter loop at top
116 movaps 0x20(%eax), %xmm2
118 mulps 0x10(%edx), %xmm1
120 movaps 0x30(%eax), %xmm3
122 mulps 0x20(%edx), %xmm2
124 movaps 0x40(%eax), %xmm0
126 mulps 0x30(%edx), %xmm3
128 movaps 0x50(%eax), %xmm1
135 # OK, now we've done with all the multiplies, but
136 # we still need to handle the unaccumulated
137 # products in xmm2 and xmm3
142 # now we want to add all accumulators into xmm4
149 # At this point, xmm4 contains 4 partial sums. We need
150 # to compute a "horizontal add" across xmm4.
151 # This is a fairly nasty operation...
153 .cleanup: # xmm4 = d1 d2 d3 d4
154 xorps %xmm0, %xmm0 # xmm0 = 0 0 0 0 (may be unnecessary)
155 movhlps %xmm4, %xmm0 # xmm0 = 0 0 d1 d2
156 addps %xmm4, %xmm0 # xmm0 = d1 d2 d1+d3 d2+d4
157 movaps %xmm0, %xmm1 # xmm1 = d1 d2 d1+d3 d2+d4
158 shufps $0xE1, %xmm4, %xmm1 # xmm1 = d1 d2 d2+d4 d1+d3
159 addss %xmm1, %xmm0 # xmm1 = d1 d2 d1+d3 d1+d2+d3+d4
160 movss %xmm0, 16(%ebp) # store low 32 bits (sum) to memory
161 flds 16(%ebp) # and load onto FPU stack for return
166 FUNC_TAIL(float_dotprod_sse)
167 .ident "Hand coded x86 SSE assembly"