2 # Copyright 2002 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
23 # input and taps are guaranteed to be 16-byte aligned.
24 # n_2_ccomplex_blocks is != 0
27 # ccomplex_dotprod_generic (const float *input,
28 # const float *taps, unsigned n_2_ccomplex_blocks, float *result)
37 # sum0 += input[0] * taps[0] - input[1] * taps[1];
38 # sum1 += input[0] * taps[1] + input[1] * taps[0];
39 # sum2 += input[2] * taps[2] - input[3] * taps[3];
40 # sum3 += input[2] * taps[3] + input[3] * taps[2];
45 # } while (--n_2_ccomplex_blocks != 0);
48 # result[0] = sum0 + sum2;
49 # result[1] = sum1 + sum3;
53 # TODO: prefetch and better scheduling
57 .file "ccomplex_dotprod_3dnow.S"
# ccomplex_dotprod_3dnow: complex dot product of input and taps, using the
# MMX/3DNow! registers (the reference C loop is in the file header comment).
# Arguments are read from the stack relative to %ebp:
#    8(%ebp) input, 12(%ebp) taps, 16(%ebp) n_2_ccomplex_blocks, 20(%ebp) result
# NOTE(review): the frame setup that makes %ebp valid, the main loop body and
# the final stores are not visible in this excerpt -- confirm against the full file.
61 .globl GLOB_SYMB(ccomplex_dotprod_3dnow)
62 DEF_FUNC_HEAD(ccomplex_dotprod_3dnow)
63 GLOB_SYMB(ccomplex_dotprod_3dnow):
66 movl 8(%ebp), %eax # eax = input pointer
67 movl 12(%ebp), %edx # edx = taps pointer
68 movl 16(%ebp), %ecx # ecx = n_2_ccomplex_blocks
# presumably mm6/mm7 hold the running sums of the C reference -- confirm against the loop body
72 pxor %mm6, %mm6 # mm6 = 0 0 (x XOR x clears both 32-bit lanes)
76 pxor %mm7, %mm7 # mm7 = 0 0 (x XOR x clears both 32-bit lanes)
82 shrl $1, %ecx # ecx = n_2_ccomplex_blocks / 2 (unsigned; the low bit is dropped here)
90 # something like ?? cycles / loop (cycle count was never filled in)
96 # complex prod: C += A * B, w/ temp Z, mmPN=$80000000
101 # # 3DNow! replacement for: pswapd %mmA, %mmZ
102 # # TODO: optimize the punpckhdq
104 # punpckhdq %mmZ, %mmZ
105 # punpckldq %mmA, %mmZ
110 # # 3DNow! replacement for: pfpnacc %mmB, %mmA
117 # A=mm0, B=mm2, Z=mm4
118 # A'=mm1, B'=mm3, Z'=mm5
170 # We've handled the bulk of multiplies up to here.
171 # Let's see if original n_2_ccomplex_blocks was odd.
172 # If so, we've got 2 more taps to do.
174 movl 16(%ebp), %ecx # reload the original n_2_ccomplex_blocks (ecx was halved above)
178 # The count was odd, do 2 more taps.
179 # Note that we've already got mm0/mm2 & mm1/mm3 preloaded
180 # from the main loop.
198 # mmNP: sign-inverting mask ("negative inversor"); only bit 63 of mm0 ends up set
# NOTE(review): the macro comment above spells this name mmPN -- confirm which is intended
200 pcmpeqd %mm0, %mm0 # mm0 compared with itself is equal in every lane: all 64 bits become 1
201 psllq $63, %mm0 # shift the quadword left by 63: only the highest bit (bit 63) survives
206 movl 20(%ebp), %eax # eax = result pointer (replaces the input pointer loaded earlier)
214 FUNC_TAIL(ccomplex_dotprod_3dnow)
215 .ident "Hand coded x86 3DNow! assembly"