git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/float_dotprod_3dnow64.S

   1 #
   2 # Copyright 2002,2005 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22
  23 # input and taps are guaranteed to be 16 byte aligned.
  24 # n_4_float_blocks is != 0
  25 #
  26 #
  27 #  float
  28 #  float_dotprod_generic (const float *input,
  29 #                         const float *taps, unsigned n_4_float_blocks)
  30 #  {
  31 #    float sum0 = 0;
  32 #    float sum1 = 0;
  33 #    float sum2 = 0;
  34 #    float sum3 = 0;
  35 #
  36 #    do {
  37 #
  38 #      sum0 += input[0] * taps[0];
  39 #      sum1 += input[1] * taps[1];
  40 #      sum2 += input[2] * taps[2];
  41 #      sum3 += input[3] * taps[3];
  42 #
  43 #      input += 4;
  44 #      taps += 4;
  45 #
  46 #    } while (--n_4_float_blocks != 0);
  47 #
  48 #
  49 #    return sum0 + sum1 + sum2 + sum3;
  50 #  }
  51 #
  52
  53 #include "assembly.h"
  54
  55
  56         .file   "float_dotprod_3dnow64.S"
  57         .version        "01.01"
  58 .text
  59         .p2align 4
  60 .globl GLOB_SYMB(float_dotprod_3dnow)
  61         DEF_FUNC_HEAD(float_dotprod_3dnow)
  62 GLOB_SYMB(float_dotprod_3dnow):
  63
  64         # intput: rdi, taps: rsi, n_2_ccomplex_blocks: rdx
  65
  66         mov     %rdx, %rax
  67
  68         # zero accumulators
  69
  70         pxor    %mm4, %mm4              # mm4 = 0 0
  71         pxor    %mm5, %mm5              # mm5 = 0 0
  72         pxor    %mm6, %mm6              # mm6 = 0 0
  73         pxor    %mm7, %mm7              # mm7 = 0 0
  74
  75         shr     $1, %rax                # rax = n_4_float_blocks / 2
  76         movq    0(%rsi), %mm0
  77         movq    8(%rsi), %mm1
  78         pxor    %mm2, %mm2
  79         pxor    %mm3, %mm3
  80         jmp     .L1_test
  81
  82         #
  83         # 8 taps / loop
  84         # something like 6 cycles / loop
  85         #
  86
  87         .p2align 4
  88 .loop1:
  89         pfmul   0(%rdi), %mm0
  90         pfadd   %mm2, %mm6
  91         movq    16(%rsi), %mm2
  92
  93         pfmul   8(%rdi), %mm1
  94         pfadd   %mm3, %mm7
  95         movq    24(%rsi), %mm3
  96
  97         pfmul   16(%rdi), %mm2
  98         pfadd   %mm0, %mm4
  99         movq    32(%rsi), %mm0
 100
 101         pfmul   24(%rdi), %mm3
 102         pfadd   %mm1, %mm5
 103         movq    40(%rsi), %mm1
 104
 105         add     $32, %rdi
 106         add     $32, %rsi
 107 .L1_test:
 108         dec     %rax
 109         jge     .loop1
 110
 111         # We've handled the bulk of multiplies up to here.
 112         # Now accumulate the final two additions and see if original
 113         # n_4_float_blocks was odd.  If so, we've got 4 more
 114         # taps to do.
 115
 116         pfadd   %mm2, %mm6
 117         and     $1, %rdx
 118         pfadd   %mm3, %mm7
 119         je      .Leven
 120
 121         # The count was odd, do 4 more taps.
 122         # Note that we've already got mm0 and mm1 preloaded
 123         # from the main loop.
 124
 125         pfmul   0(%rdi), %mm0
 126         pfadd   %mm0, %mm4
 127         pfmul   8(%rdi), %mm1
 128         pfadd   %mm1, %mm5
 129
 130 .Leven:
 131         # at this point mm4, mm5, mm6 and mm7 contain partial sums
 132
 133         pfadd   %mm7, %mm6
 134         pfadd   %mm5, %mm4
 135         pfadd   %mm6, %mm4
 136         pfacc   %mm4, %mm4
 137
 138         movd    %mm4, -8(%rsp)
 139         movss   -8(%rsp), %xmm0
 140         femms
 141
 142         retq
 143
 144 FUNC_TAIL(float_dotprod_3dnow)
 145         .ident  "Hand coded x86_64 3DNow! assembly"