2 # Copyright 2002,2005 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
23 # input and taps are guaranteed to be 16-byte aligned.
24 # n_2_ccomplex_blocks is != 0
27 # ccomplex_dotprod_generic (const float *input,
28 # const float *taps, unsigned n_2_ccomplex_blocks, float *result)
37 # sum0 += input[0] * taps[0] - input[1] * taps[1];
38 # sum1 += input[0] * taps[1] + input[1] * taps[0];
39 # sum2 += input[2] * taps[2] - input[3] * taps[3];
40 # sum3 += input[2] * taps[3] + input[3] * taps[2];
45 # } while (--n_2_ccomplex_blocks != 0);
48 # result[0] = sum0 + sum2;
49 # result[1] = sum1 + sum3;
53 # TODO: prefetch and better scheduling
58 .file "ccomplex_dotprod_sse64.S"
# ccomplex_dotprod_sse: SSE complex dot product of input and taps.
# Arguments arrive in rdi (input), rsi (taps), rdx (n_2_ccomplex_blocks)
# and rcx (result); one complex value (two floats) is stored at (%rcx).
# Vector loads use movaps, so input and taps must be 16-byte aligned.
# xmm6/xmm7 are the running accumulators; xmm0-xmm5 are working registers.
62 .globl GLOB_SYMB(ccomplex_dotprod_sse)
63 DEF_FUNC_HEAD(ccomplex_dotprod_sse)
64 GLOB_SYMB(ccomplex_dotprod_sse):
66 # input: rdi, taps: rsi, n_2_ccomplex_blocks: rdx, result: rcx
70 xorps %xmm6, %xmm6 # zero accumulators
74 xorps %xmm7, %xmm7 # zero accumulators
78 shr $1, %rax # rax = n_2_ccomplex_blocks / 2 (an odd leftover block is handled after the main loop)
84 # something like ?? cycles / loop
90 # complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000
92 # movaps (%rdi), %xmmA
93 # movaps (%rsi), %xmmB
96 # shufps $0xb1, %xmmZ, %xmmZ # swap internals
101 # # SSE replacement for: pfpnacc %xmmB, %xmmA
102 # xorps %xmmPN, %xmmA
103 # movaps %xmmA, %xmmZ
104 # unpcklps %xmmB, %xmmA
105 # unpckhps %xmmB, %xmmZ
106 # movaps %xmmZ, %xmmY
107 # shufps $0x44, %xmmA, %xmmZ # b01000100
108 # shufps $0xee, %xmmY, %xmmA # b11101110
113 # A=xmm0, B=xmm2, Z=xmm4
114 # A'=xmm1, B'=xmm3, Z'=xmm5
116 movaps 16(%rdi), %xmm1 # A' = 16 bytes at input + 16
121 shufps $0xb1, %xmm4, %xmm4 # swap internals: exchange the two floats inside each 64-bit half of Z
122 movaps 16(%rsi), %xmm3 # B' = 16 bytes at taps + 16
126 shufps $0xb1, %xmm5, %xmm5 # swap internals: same exchange for Z'
129 movaps 32(%rdi), %xmm0 # A = 16 bytes at input + 32 (stays loaded for the odd-count tail below)
135 movaps 32(%rsi), %xmm2 # B = 16 bytes at taps + 32 (stays loaded for the odd-count tail below)
146 # We've handled the bulk of multiplies up to here.
147 # Let's see if original n_2_ccomplex_blocks was odd.
148 # If so, we've got 2 more taps to do.
153 # The count was odd, do 2 more taps.
154 # Note that we've already got xmm0/xmm2 preloaded
155 # from the main loop.
159 shufps $0xb1, %xmm4, %xmm4 # swap internals
# Build the sign-flip mask in xmm1 via a scratch slot below the stack pointer.
# NOTE(review): rsp is not adjusted first, so this relies on the System V
# AMD64 red zone; confirm the target ABI provides one.
168 movl $0x80000000, -8(%rsp) # 0x80000000 is the float sign bit (-0.0f)
169 movss -8(%rsp), %xmm1 # xmm1 = -0 0 0 0 (movss from memory clears the upper lanes)
170 shufps $0x11, %xmm1, %xmm1 # b00010001 # 0 -0 0 -0 (lanes listed low to high)
# The unpck/shufps sequence below matches the tail of the pfpnacc replacement
# sketched above, with A=xmm6, B=xmm7, Z=xmm2, Y=xmm3.
176 unpcklps %xmm7, %xmm6
177 unpckhps %xmm7, %xmm2
179 shufps $0x44, %xmm6, %xmm2 # b01000100
180 shufps $0xee, %xmm3, %xmm6 # b11101110
184 movhlps %xmm6, %xmm4 # xmm4 = r3 i4 ?? ??
185 addps %xmm4, %xmm6 # xmm6 = r1+r3 i2+i4 ?? ??
186 movlps %xmm6, (%rcx) # store low 2x32 bits (complex) to memory
190 FUNC_TAIL(ccomplex_dotprod_sse)
191 .ident "Hand coded x86_64 SSE assembly"