2 # Copyright 2002,2005 Free Software Foundation, Inc.
4 # This file is part of GNU Radio
6 # GNU Radio is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3, or (at your option)
11 # GNU Radio is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with GNU Radio; see the file COPYING. If not, write to
18 # the Free Software Foundation, Inc., 51 Franklin Street,
19 # Boston, MA 02110-1301, USA.
22 # SIMD MMX dot product
23 # Equivalent to the following C code:
24 # long dotprod(signed short *a,signed short *b,int cnt)
32 # a and b should also be 64-bit aligned, or speed will suffer greatly
33 # Copyright 1999, Phil Karn KA9Q
34 # May be used under the terms of the GNU public license
39 .file "short_dotprod_mmx64.S"
# short_dotprod_mmx: MMX dot product of two arrays of signed 16-bit values.
# Register use in the code below:
#   rdi = a, rsi = b, rdx = cnt on entry (rdx/edx is reused as scratch later)
#   mm0 = accumulator holding two independent 32-bit partial sums
#   mm1 = scratch for one group of four products
# GLOB_SYMB, DEF_FUNC_HEAD and FUNC_TAIL are macros defined outside this file;
# presumably they handle symbol decoration and function metadata -- verify in
# the header that provides them.
43 .globl GLOB_SYMB(short_dotprod_mmx)
44 DEF_FUNC_HEAD(short_dotprod_mmx)
45 GLOB_SYMB(short_dotprod_mmx):
47 # a: rdi, b: rsi, cnt: rdx
49 pxor %mm0,%mm0 # mm0 = 0: clear running sum (kept as two 32-bit halves)
51 # MMX dot product loop unrolled 4 times, crunching 16 terms per loop
# NOTE(review): 16 terms per pass against a count step of 4 suggests cnt is
# expressed in groups of 4 terms -- confirm against the callers.
53 .Loop1mmx: sub $4,%rdx # rdx -= 4 for each pass of the unrolled loop
56 movq (%rdi),%mm1 # load 8 bytes from a: mm1 = a[3],a[2],a[1],a[0]
57 pmaddwd (%rsi),%mm1 # multiply word pairs, add adjacent products: mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
79 # MMX dot product loop, not unrolled, crunching 4 terms per loop
80 # This could be redone as Duff's Device on the unrolled loop above
# Horizontal reduction: fold the two 32-bit halves of mm0 into one integer.
92 movd %mm0,%edx # low 32-bit half of mm0 -> edx (overwrites the cnt register)
93 punpckhdq %mm0,%mm0 # copy the high 32-bit half of mm0 into its low half
# NOTE(review): eax is expected to already hold the other half of the sum at
# this point -- confirm against the full routine.
95 addl %edx,%eax # eax += edx (32-bit add); combined sum now in %eax
100 FUNC_TAIL(short_dotprod_mmx)
101 .ident "Hand coded x86_64 MMX assembly"