git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/fcomplex_dotprod_3dnow.S

   1 #
   2 # Copyright 2002 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22 # input and taps are guaranteed to be 16 byte aligned.
  23 # n_2_complex_blocks is != 0
  24 #
  25 #
  26 #  fcomplex_dotprod_generic (const float *input,
  27 #                         const float *taps, unsigned n_2_complex_blocks, float *result)
  28 #  {
  29 #    float sum0 = 0;
  30 #    float sum1 = 0;
  31 #    float sum2 = 0;
  32 #    float sum3 = 0;
  33 #
  34 #    do {
  35 #
  36 #      sum0 += input[0] * taps[0];
  37 #      sum1 += input[0] * taps[1];
  38 #      sum2 += input[1] * taps[2];
  39 #      sum3 += input[1] * taps[3];
  40 #
  41 #      input += 2;
  42 #      taps += 4;
  43 #
  44 #    } while (--n_2_complex_blocks != 0);
  45 #
  46 #
  47 #    result[0] = sum0 + sum2;
  48 #    result[1] = sum1 + sum3;
  49 #  }
  50 #
  51
  52 #include "assembly.h"
  53
  54
  55         .file   "fcomplex_dotprod_3dnow.S"
  56         .version        "01.01"
  57 .text
  58         .p2align 4                      # align the entry point to 16 bytes
  59 .globl GLOB_SYMB(fcomplex_dotprod_3dnow)
  60         DEF_FUNC_HEAD(fcomplex_dotprod_3dnow)
  61 GLOB_SYMB(fcomplex_dotprod_3dnow):
  62         pushl   %ebp
  63         movl    %esp, %ebp              # frame pointer: stack arguments start at 8(%ebp)
  64         movl    8(%ebp), %eax           # input
  65         movl    12(%ebp), %edx          # taps
  66         movl    16(%ebp), %ecx          # n_2_complex_blocks
  67         # eax walks input, edx walks taps, ecx becomes the main-loop counter
  68         # zero accumulators
  69         # (two packed floats each; low lane ends up in result[0], high lane in result[1])
  70         pxor    %mm4, %mm4              # mm4 = 0 0
  71         pxor    %mm5, %mm5              # mm5 = 0 0
  72         pxor    %mm6, %mm6              # mm6 = 0 0
  73         pxor    %mm7, %mm7              # mm7 = 0 0
  74
  75         shrl    $1, %ecx                # ecx = n_2_complex_blocks / 2
  76         # preload the first input pair; the loop below keeps one pair loaded ahead
  77         movq    0(%eax), %mm0           # mm0 = input[0] (low), input[1] (high)
  78         # mm2/mm3 start at zero so the first loop pass adds nothing into mm6/mm7
  79         pxor    %mm2, %mm2
  80         pxor    %mm3, %mm3
  81
  82         movq    %mm0, %mm1
  83         punpckldq       %mm0, %mm0      # mm0 = input[0] in both lanes
  84         punpckhdq       %mm1, %mm1      # mm1 = input[1] in both lanes
  85
  86
  87         jmp     .L1_test                # loop may run zero times (count == 1)
  88
  89         #
  90         # 4 complex taps (32 bytes of taps, 16 bytes of input) / loop
  91         # something like ?? cycles / loop
  92         #
  93
  94         .p2align 4
  95 .Loop1:
  96         pfmul   0(%edx), %mm0           # mm0 = input[0] * taps[0], taps[1]
  97         pfadd   %mm2, %mm6              # add the mm2 products left pending by the previous pass
  98
  99         movq    8(%eax), %mm2           # mm2 = input[2], input[3]
 100
 101         pfadd   %mm3, %mm7              # same for the pending mm3 products
 102
 103         pfmul   8(%edx), %mm1           # mm1 = input[1] * taps[2], taps[3]
 104
 105         movq    %mm2, %mm3
 106         punpckldq       %mm2, %mm2      # mm2 = input[2] in both lanes
 107         punpckhdq       %mm3, %mm3      # mm3 = input[3] in both lanes
 108
 109
 110         pfmul   16(%edx), %mm2          # mm2 = input[2] * taps[4], taps[5]
 111         pfadd   %mm0, %mm4
 112         # preload the input pair for the next pass (or for the odd tail)
 113         movq    16(%eax), %mm0          # mm0 = input[4], input[5]
 114         # NOTE(review): on the last pass of an even count that load is 8 bytes past the input consumed; confirm callers tolerate it
 115         pfadd   %mm1, %mm5
 116
 117         movq    %mm0, %mm1
 118         punpckldq       %mm0, %mm0
 119
 120         pfmul   24(%edx), %mm3          # mm3 = input[3] * taps[6], taps[7]; added to mm7 later
 121
 122         punpckhdq       %mm1, %mm1
 123
 124
 125 #TODO: add prefetch?
 126
 127         addl    $32, %edx               # taps  += 8 floats
 128         addl    $16, %eax               # input += 4 floats
 129
 130 .L1_test:
 131         decl    %ecx
 132         jge     .Loop1                  # repeat while the decremented count is still >= 0
 133
 134         # We've handled the bulk of multiplies up to here.
 135         # Now accumulate the final two additions and see if original
 136         # n_2_complex_blocks was odd.  If so, we've got 2 more
 137         # taps to do.
 138
 139         movl    16(%ebp), %ecx          # reload the original n_2_complex_blocks
 140         pfadd   %mm2, %mm6
 141         andl    $1, %ecx                # ZF = 1 when the count was even
 142         pfadd   %mm3, %mm7              # sits between andl and je; the branch uses the flags from andl
 143         je      .Leven
 144
 145         # The count was odd, do 2 more taps.
 146         # Note that we've already got mm0 and mm1 preloaded
 147         # from the main loop.
 148
 149         pfmul   0(%edx), %mm0
 150         pfadd   %mm0, %mm4
 151         pfmul   8(%edx), %mm1
 152         pfadd   %mm1, %mm5
 153
 154
 155 .Leven:
 156         # at this point mm4, mm5, mm6 and mm7 contain partial sums
 157
 158         pfadd   %mm7, %mm6
 159         pfadd   %mm5, %mm4
 160
 161         movl    20(%ebp), %eax          # result
 162
 163         pfadd   %mm6, %mm4              # mm4 = mm4 + mm5 + mm6 + mm7
 164
 165         movq    %mm4, (%eax)            # store result[0] (low lane) and result[1] (high lane)
 166         femms                           # exit MMX/3DNow! state before returning
 167
 168         popl    %ebp
 169         ret
 170
 171 FUNC_TAIL(fcomplex_dotprod_3dnow)
 172         .ident  "Hand coded x86 3DNow! assembly"
 173
 174 #if defined(__linux__) && defined(__ELF__)
 175 .section .note.GNU-stack,"",%progbits
 176 #endif