2 # Copyright 2002 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
23 # input and taps are guaranteed to be 16 byte aligned.
24 # n_2_complex_blocks is != 0
27 # complex_dotprod_generic (const short *input,
28 # const float *taps, unsigned n_2_complex_blocks, float *result)
37 # sum0 += input[0] * taps[0];
38 # sum1 += input[0] * taps[1];
39 # sum2 += input[1] * taps[2];
40 # sum3 += input[1] * taps[3];
45 # } while (--n_2_complex_blocks != 0);
48 # result[0] = sum0 + sum2;
49 # result[1] = sum1 + sum3;
55 .file "complex_dotprod_3dnowext.S"
59 .globl GLOB_SYMB(complex_dotprod_3dnowext)
60 DEF_FUNC_HEAD(complex_dotprod_3dnowext)
61 GLOB_SYMB(complex_dotprod_3dnowext):
64 movl 8(%ebp), %eax # eax = input (1st argument of the C prototype above)
65 movl 12(%ebp), %edx # edx = taps (2nd argument)
70 pxor %mm4, %mm4 # mm4 = 0 0 (clear partial-sum accumulator)
71 pxor %mm5, %mm5 # mm5 = 0 0 (clear partial-sum accumulator)
72 pxor %mm6, %mm6 # mm6 = 0 0 (clear partial-sum accumulator)
73 pxor %mm7, %mm7 # mm7 = 0 0 (clear partial-sum accumulator)
76 shrl $1, %ecx # ecx = n_2_complex_blocks / 2
79 pshufw $0x55, %mm0, %mm1 # 0x55 = b01010101: mm1 = word 1 of mm0 copied into all four 16-bit lanes
92 # something like ?? cycles / loop (per-iteration cost was never filled in)
100 pshufw $0, 4(%eax), %mm2 # mm2 = word 0 of the quadword at 4(%eax), copied into all four lanes
106 pshufw $0x55, 4(%eax), %mm3 # 0x55 = b01010101: mm3 = word 1 of the quadword at 4(%eax), in all four lanes
112 pshufw $0, 8(%eax), %mm0 # mm0 = word 0 of the quadword at 8(%eax), copied into all four lanes
117 pshufw $0x55, 8(%eax), %mm1 # 0x55 = b01010101: mm1 = word 1 of the quadword at 8(%eax), in all four lanes
130 # We've handled the bulk of multiplies up to here.
131 # Now accumulate the final two additions and see if original
132 # n_2_complex_blocks was odd. If so, we've got 2 more taps to do.
141 # The count was odd, do 2 more taps.
142 # Note that we've already got mm0 and mm1 preloaded
143 # from the main loop.
151 # at this point mm4, mm5, mm6 and mm7 contain partial sums
156 movl 20(%ebp), %eax # eax = result (4th argument); replaces the input pointer held in eax
165 FUNC_TAIL(complex_dotprod_3dnowext)
166 .ident "Hand coded x86 3DNow!Ext assembly"
169 #if defined(__linux__) && defined(__ELF__)
170 .section .note.GNU-stack,"",%progbits