2 # Copyright 2002 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
23 # input and taps are guaranteed to be 16 byte aligned.
24 # n_2_complex_blocks is != 0
27 # complex_dotprod_generic (const short *input,
28 # const float *taps, unsigned n_2_complex_blocks, float *result)
37 # sum0 += input[0] * taps[0];
38 # sum1 += input[0] * taps[1];
39 # sum2 += input[1] * taps[2];
40 # sum3 += input[1] * taps[3];
45 # } while (--n_2_complex_blocks != 0);
48 # result[0] = sum0 + sum2;
49 # result[1] = sum1 + sum3;
53 # TODO: prefetch and better scheduling
58 .file "complex_dotprod_sse.S"
#
# complex_dotprod_sse
#
# Dot product of 16-bit integer input samples with float taps, producing
# one complex (two float) result, as sketched by the pseudo-code in the
# header comment of this file.
#
# Syntax: GAS, AT&T operand order (source first), 32-bit x86.
# Arguments are read relative to %ebp:
#   input  at  8(%ebp)   const short *
#   taps   at 12(%ebp)   const float *
#   result at 20(%ebp)   float *, two floats are stored
# NOTE(review): the frame setup and the load of the block count into
# %ecx are not visible in this view; presumably the count is taken from
# 16(%ebp) -- confirm against the complete file.
#
# Registers touched by the visible code: eax, ecx, edx, mm0-mm3,
# xmm0-xmm6, flags.
#
62 .globl GLOB_SYMB(complex_dotprod_sse)
63 DEF_FUNC_HEAD(complex_dotprod_sse)
64 GLOB_SYMB(complex_dotprod_sse):
67 movl 8(%ebp), %eax # eax = input pointer
68 movl 12(%ebp), %edx # edx = taps pointer
72 # xmm0 xmm1 xmm2 xmm3 are used to hold taps and the result of mults
73 # xmm4 xmm5 xmm6 xmm7 are used to hold the accumulated results
75 xorps %xmm4, %xmm4 # zero two accumulators
76 xorps %xmm5, %xmm5 # xmm5 holds zero for use below
78 # first handle any non-zero remainder of (n_2_complex_blocks % 4)
# NOTE(review): the remainder test, the loop branch and the steps that
# turn the unpacked words into floats are not visible in this view.
87 punpcklwd 0(%eax), %mm0 # two input shorts land in the upper word of each dword of mm0
90 shufps $0x50, %xmm0, %xmm0 # selector 0x50: lanes become x0 x0 x1 x1 (low to high)
101 # set up for primary loop which is unrolled 4 times
104 movaps %xmm5, %xmm6 # zero remaining accumulators
107 shrl $2, %ecx # n_2_complex_blocks / 4
108 je .Lcleanup # if zero, take short path
110 # finish setup and loop priming
# Each 4-byte step of %eax below is one pair of shorts; each 0x10-byte
# step of %edx is four float taps.
113 punpcklwd 0(%eax), %mm0 # shorts at input+0 -> mm0 (upper word of each dword)
116 shufps $0x50, %xmm0, %xmm0 # duplicate the two low floats: x0 x0 x1 x1
121 punpcklwd 4(%eax), %mm1 # shorts at input+4 -> mm1
124 shufps $0x50, %xmm1, %xmm1 # duplicate the two low floats of xmm1
128 # we know ecx is not zero, we checked above,
129 # hence enter loop at top
# NOTE(review): the loop label, the accumulating adds, the pointer
# updates and the loop branch are not visible in this view.
137 punpcklwd 8(%eax), %mm2 # shorts at input+8 -> mm2
140 shufps $0x50, %xmm2, %xmm2 # duplicate the two low floats of xmm2
142 mulps 0x10(%edx), %xmm1 # xmm1 *= four taps at byte offset 0x10
146 punpcklwd 12(%eax), %mm3 # shorts at input+12 -> mm3
149 shufps $0x50, %xmm3, %xmm3 # duplicate the two low floats of xmm3
151 mulps 0x20(%edx), %xmm2 # xmm2 *= four taps at byte offset 0x20
155 punpcklwd 16(%eax), %mm0 # start on the next group: shorts at input+16 -> mm0
158 shufps $0x50, %xmm0, %xmm0 # duplicate the two low floats of xmm0
160 mulps 0x30(%edx), %xmm3 # xmm3 *= four taps at byte offset 0x30
164 punpcklwd 20(%eax), %mm1 # shorts at input+20 -> mm1
167 shufps $0x50, %xmm1, %xmm1 # duplicate the two low floats of xmm1
174 # OK, now we've done with all the multiplies, but
175 # we still need to handle the unaccumulated
176 # products in xmm2 and xmm3
181 # now we want to add all accumulators into xmm4
188 # At this point, xmm4 contains 2x2 partial sums. We need
189 # to compute a "horizontal complex add" across xmm4.
# The fold below adds the upper 64 bits of xmm4 onto its lower 64 bits,
# which corresponds to result[0] = sum0 + sum2 and result[1] = sum1 + sum3
# in the header pseudo-code, assuming lane k of xmm4 holds sum k.
191 .Lcleanup: # xmm4 = four partial sums (two real/imag pairs)
192 movl 20(%ebp), %eax # eax = result pointer
193 movhlps %xmm4, %xmm0 # low 64 bits of xmm0 = high 64 bits of xmm4; upper half of xmm0 is unspecified
194 addps %xmm4, %xmm0 # xmm0 lane0 = xmm4 lane0+lane2, lane1 = xmm4 lane1+lane3
195 movlps %xmm0, (%eax) # store low 2x32 bits (complex) to memory
# NOTE(review): the return sequence is not visible in this view. MMX
# registers are written above, so confirm that emms is executed before
# control returns to code that may use the x87 FPU.
201 FUNC_TAIL(complex_dotprod_sse)
202 .ident "Hand coded x86 SSE assembly"
204 #if defined(__linux__) && defined(__ELF__)
205 .section .note.GNU-stack,"",%progbits