git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/fcomplex_dotprod_3dnow64.S

   1 #
   2 # Copyright 2002,2005 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22 # input and taps are guaranteed to be 16 byte aligned.
  23 # n_2_complex_blocks is != 0
  24 #
  25 #
  26 #  fcomplex_dotprod_generic (const float *input,
  27 #                         const float *taps, unsigned n_2_complex_blocks, float *result)
  28 #  {
  29 #    float sum0 = 0;
  30 #    float sum1 = 0;
  31 #    float sum2 = 0;
  32 #    float sum3 = 0;
  33 #
  34 #    do {
  35 #
  36 #      sum0 += input[0] * taps[0];
  37 #      sum1 += input[0] * taps[1];
  38 #      sum2 += input[1] * taps[2];
  39 #      sum3 += input[1] * taps[3];
  40 #
  41 #      input += 2;
  42 #      taps += 4;
  43 #
  44 #    } while (--n_2_complex_blocks != 0);
  45 #
  46 #
  47 #    result[0] = sum0 + sum2;
  48 #    result[1] = sum1 + sum3;
  49 #  }
  50 #
  51
  52 #include "assembly.h"
  53
  54
  55         .file   "fcomplex_dotprod_3dnow64.S"
  56         .version        "01.01"
  57 .text
  58         .p2align 4
  59 .globl GLOB_SYMB(fcomplex_dotprod_3dnow)
  60         DEF_FUNC_HEAD(fcomplex_dotprod_3dnow)
  61 GLOB_SYMB(fcomplex_dotprod_3dnow):
  62
  63         # intput: rdi, taps: rsi, n_2_ccomplex_blocks: rdx, result: rcx
  64
  65         mov     %rdx, %rax
  66
  67         # zero accumulators
  68
  69         pxor    %mm4, %mm4              # mm4 = 0 0
  70         pxor    %mm5, %mm5              # mm5 = 0 0
  71         pxor    %mm6, %mm6              # mm6 = 0 0
  72         pxor    %mm7, %mm7              # mm7 = 0 0
  73
  74         shr     $1, %rax                # rax = n_2_complex_blocks / 2
  75
  76         movq    0(%rdi), %mm0
  77
  78         pxor    %mm2, %mm2
  79         pxor    %mm3, %mm3
  80
  81         movq    %mm0, %mm1
  82         punpckldq       %mm0, %mm0
  83         punpckhdq       %mm1, %mm1
  84
  85
  86         jmp     .L1_test
  87
  88         #
  89         # 4 taps / loop
  90         # something like ?? cycles / loop
  91         #
  92
  93         .p2align 4
  94 .loop1:
  95         pfmul   0(%rsi), %mm0
  96         pfadd   %mm2, %mm6
  97
  98         movq    8(%rdi), %mm2
  99
 100         pfadd   %mm3, %mm7
 101
 102         pfmul   8(%rsi), %mm1
 103
 104         movq    %mm2, %mm3
 105         punpckldq       %mm2, %mm2
 106         punpckhdq       %mm3, %mm3
 107
 108
 109         pfmul   16(%rsi), %mm2
 110         pfadd   %mm0, %mm4
 111
 112         movq    16(%rdi), %mm0
 113
 114         pfadd   %mm1, %mm5
 115
 116         movq    %mm0, %mm1
 117         punpckldq       %mm0, %mm0
 118
 119         pfmul   24(%rsi), %mm3
 120
 121         punpckhdq       %mm1, %mm1
 122
 123
 124 #TODO: add prefetch?
 125
 126         add     $32, %rsi
 127         add     $16, %rdi
 128
 129 .L1_test:
 130         dec     %rax
 131         jge     .loop1
 132
 133         # We've handled the bulk of multiplies up to here.
 134         # Now accumulate the final two additions and see if original
 135         # n_2_complex_blocks was odd.  If so, we've got 2 more
 136         # taps to do.
 137
 138         pfadd   %mm2, %mm6
 139         and     $1, %rdx
 140         pfadd   %mm3, %mm7
 141         je      .Leven
 142
 143         # The count was odd, do 2 more taps.
 144         # Note that we've already got mm0 and mm1 preloaded
 145         # from the main loop.
 146
 147         pfmul   0(%rsi), %mm0
 148         pfadd   %mm0, %mm4
 149         pfmul   8(%rsi), %mm1
 150         pfadd   %mm1, %mm5
 151
 152
 153 .Leven:
 154         # at this point mm4, mm5, mm6 and mm7 contain partial sums
 155
 156         pfadd   %mm7, %mm6
 157         pfadd   %mm5, %mm4
 158         pfadd   %mm6, %mm4
 159
 160         movq    %mm4, (%rcx)            # result
 161         femms
 162
 163         retq
 164
 165 FUNC_TAIL(fcomplex_dotprod_3dnow)
 166         .ident  "Hand coded x86_64 3DNow! assembly"