2 # Copyright 2002 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
23 # input and taps are guaranteed to be 16 byte aligned.
24 # n_2_ccomplex_blocks is != 0
27 # ccomplex_dotprod_generic (const float *input,
28 # const float *taps, unsigned n_2_ccomplex_blocks, float *result)
37 # sum0 += input[0] * taps[0] - input[1] * taps[1];
38 # sum1 += input[0] * taps[1] + input[1] * taps[0];
39 # sum2 += input[2] * taps[2] - input[3] * taps[3];
40 # sum3 += input[2] * taps[3] + input[3] * taps[2];
45 # } while (--n_2_ccomplex_blocks != 0);
48 # result[0] = sum0 + sum2;
49 # result[1] = sum1 + sum3;
53 # TODO: prefetch and better scheduling
57 .file "ccomplex_dotprod_sse.S"
#
# ccomplex_dotprod_sse: SSE version of the complex dot product that is
# sketched in C in the header comment of this file.
#
# Arguments are read relative to %ebp:
#   input ................. at  8(%ebp), pointer to floats
#   taps .................. at 12(%ebp), pointer to floats
#   n_2_ccomplex_blocks ... at 16(%ebp), block count
#   result ................ at 20(%ebp), receives two floats
#
# Register roles visible in this routine:
#   input pointer ......... %eax (later reused for the result pointer)
#   taps pointer .......... %edx
#   block counter ......... %ecx
#   accumulators .......... %xmm6 and %xmm7
#   working registers ..... %xmm0 through %xmm5 (the A/B/Z sets below)
#
# NOTE(review): the argument slot at 16(%ebp) is overwritten near the
# end of the routine and used as scratch for the sign-mask constant.
# NOTE(review): assumes %ebp already holds the frame pointer when the
# argument loads execute - TODO confirm against the prologue.
# NOTE(review): GLOB_SYMB, DEF_FUNC_HEAD and FUNC_TAIL are macros that
# are not defined in this view; presumably supplied by an included
# header - confirm.
#
61 .globl GLOB_SYMB(ccomplex_dotprod_sse)
62 DEF_FUNC_HEAD(ccomplex_dotprod_sse)
63 GLOB_SYMB(ccomplex_dotprod_sse):
# Fetch the three input arguments from the stack.
66 movl 8(%ebp), %eax # input
67 movl 12(%ebp), %edx # taps
68 movl 16(%ebp), %ecx # n_2_ccomplex_blocks
70 xorps %xmm6, %xmm6 # zero accumulators
74 xorps %xmm7, %xmm7 # zero accumulators
# Halve the counter. The unshifted count is reloaded from the stack
# further down, where the odd-count case is handled.
78 shrl $1, %ecx # ecx = n_2_ccomplex_blocks / 2
84 # something like ?? cycles / loop
90 # complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000
92 # movaps (%eax), %xmmA
93 # movaps (%edx), %xmmB
96 # shufps $0xb1, %xmmZ, %xmmZ # swap internals
101 # # SSE replacement for: pfpnacc %xmmB, %xmmA
102 # xorps %xmmPN, %xmmA
103 # movaps %xmmA, %xmmZ
104 # unpcklps %xmmB, %xmmA
105 # unpckhps %xmmB, %xmmZ
106 # movaps %xmmZ, %xmmY
107 # shufps $0x44, %xmmA, %xmmZ # b01000100
108 # shufps $0xee, %xmmY, %xmmA # b11101110
113 # A=xmm0, B=xmm2, Z=xmm4
114 # A'=xmm1, B'=xmm3, Z'=xmm5
# Primed register set: aligned 16-byte loads from offset 16 of each
# array. movaps needs the 16-byte alignment stated in the file header.
116 movaps 16(%eax), %xmm1
# shufps $0xb1 with the same register as source and destination
# exchanges the two floats inside each 64-bit half (elements 0<->1
# and 2<->3).
121 shufps $0xb1, %xmm4, %xmm4 # swap internals
122 movaps 16(%edx), %xmm3
126 shufps $0xb1, %xmm5, %xmm5 # swap internals
# Loads from offset 32 into the unprimed A/B registers (xmm0/xmm2).
# The odd-count comment below relies on these being preloaded.
129 movaps 32(%eax), %xmm0
135 movaps 32(%edx), %xmm2
146 # We've handled the bulk of multiplies up to here.
147 # Let's see if original n_2_ccomplex_blocks was odd.
148 # If so, we've got 2 more taps to do.
150 movl 16(%ebp), %ecx # n_2_ccomplex_blocks
154 # The count was odd, do 2 more taps.
155 # Note that we've already got xmm0/xmm2 preloaded
156 # from the main loop.
160 shufps $0xb1, %xmm4, %xmm4 # swap internals
# Build the sign mask in xmm1. The argument slot at 16(%ebp) is reused
# as scratch: the constant 0x80000000 is the bit pattern of -0.0f.
# movss from memory places it in element 0 and clears elements 1-3;
# shufps $0x11 then rearranges that into (0, -0, 0, -0), which is the
# xmmPN pattern named in the template comment above.
169 movl $0x80000000, 16(%ebp)
170 movss 16(%ebp), %xmm1
171 shufps $0x11, %xmm1, %xmm1 # b00010001 # 0 -0 0 -0
# The unpack/shuffle sequence below appears to instantiate the
# "SSE replacement for pfpnacc" template above with A=xmm6, B=xmm7,
# Z=xmm2 and Y=xmm3, combining the two accumulators.
177 unpcklps %xmm7, %xmm6
178 unpckhps %xmm7, %xmm2
180 shufps $0x44, %xmm6, %xmm2 # b01000100
181 shufps $0xee, %xmm3, %xmm6 # b11101110
# Final reduction: copy the upper two floats of xmm6 into the lower
# half of xmm4, add them onto the lower two floats of xmm6, and store
# those two floats through the result pointer.
185 movl 20(%ebp), %eax # @result
186 movhlps %xmm6, %xmm4 # xmm4 = r3 i4 ?? ??
187 addps %xmm4, %xmm6 # xmm6 = r1+r3 i2+i4 ?? ??
188 movlps %xmm6, (%eax) # store low 2x32 bits (complex) to memory
193 FUNC_TAIL(ccomplex_dotprod_sse)
194 .ident "Hand coded x86 SSE assembly"
196 #if defined(__linux__) && defined(__ELF__)
197 .section .note.GNU-stack,"",%progbits