git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/float_dotprod_sse.S

   1 #
   2 # Copyright 2002 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22
  23 # input and taps are guarenteed to be 16 byte aligned.
  24 # n_4_float_blocks is != 0
  25 #
  26 #
  27 #  float
  28 #  float_dotprod_generic (const float *input,
  29 #                         const float *taps, unsigned n_4_float_blocks)
  30 #  {
  31 #    float sum0 = 0;
  32 #    float sum1 = 0;
  33 #    float sum2 = 0;
  34 #    float sum3 = 0;
  35 #
  36 #    do {
  37 #
  38 #      sum0 += input[0] * taps[0];
  39 #      sum1 += input[1] * taps[1];
  40 #      sum2 += input[2] * taps[2];
  41 #      sum3 += input[3] * taps[3];
  42 #
  43 #      input += 4;
  44 #      taps += 4;
  45 #
  46 #    } while (--n_4_float_blocks != 0);
  47 #
  48 #
  49 #    return sum0 + sum1 + sum2 + sum3;
  50 #  }
  51 #
  52
  53 #include "assembly.h"
  54
  55
  56         .file   "float_dotprod_sse.S"
  57         .version        "01.01"
  58 .text
  59         .p2align 4
  60 .globl GLOB_SYMB(float_dotprod_sse)
  61         DEF_FUNC_HEAD(float_dotprod_sse)
  62 GLOB_SYMB(float_dotprod_sse):
  63         pushl   %ebp
  64         movl    %esp, %ebp
  65         movl    8(%ebp), %edx
  66         movl    12(%ebp), %eax
  67         movl    16(%ebp), %ecx
  68
  69
  70         # xmm0 xmm1 xmm2 xmm3 are used to hold taps and the result of mults
  71         # xmm4 xmm5 xmm6 xmm7 are used to hold the accumulated results
  72
  73         xorps   %xmm4, %xmm4            # zero two accumulators
  74         xorps   %xmm5, %xmm5            # xmm5 holds zero for use below
  75
  76         # first handle any non-zero remainder of (n_4_float_blocks % 4)
  77
  78         andl    $0x3, %ecx
  79         jmp     .L1_test
  80
  81         .p2align 4
  82 .loop1:
  83         movaps  (%eax), %xmm0
  84         mulps   (%edx), %xmm0
  85         addl    $0x10, %edx
  86         addl    $0x10, %eax
  87         addps   %xmm0, %xmm4
  88 .L1_test:
  89         decl    %ecx
  90         jge     .loop1
  91
  92
  93         # set up for primary loop which is unrolled 4 times
  94
  95         movl    16(%ebp), %ecx
  96         movaps  %xmm5, %xmm6            # zero remaining accumulators
  97         movaps  %xmm5, %xmm7
  98
  99         shrl    $2, %ecx                # n_4_float_blocks / 4
 100         je      .cleanup                # if zero, take short path
 101
 102         # finish setup and loop priming
 103
 104         movaps  0x00(%eax), %xmm0
 105         movaps  %xmm5, %xmm2
 106         movaps  0x10(%eax), %xmm1
 107         movaps  %xmm5, %xmm3
 108
 109         # we know ecx is not zero, we checked above,
 110         # hence enter loop at top
 111
 112         .p2align 4
 113 .loop2:
 114         mulps   (%edx), %xmm0
 115         addps   %xmm2, %xmm6
 116         movaps  0x20(%eax), %xmm2
 117
 118         mulps   0x10(%edx), %xmm1
 119         addps   %xmm3, %xmm7
 120         movaps  0x30(%eax), %xmm3
 121
 122         mulps   0x20(%edx), %xmm2
 123         addps   %xmm0, %xmm4
 124         movaps  0x40(%eax), %xmm0
 125
 126         mulps   0x30(%edx), %xmm3
 127         addps   %xmm1, %xmm5
 128         movaps  0x50(%eax), %xmm1
 129
 130         addl    $0x40, %edx
 131         addl    $0x40, %eax
 132         decl    %ecx
 133         jne     .loop2
 134
 135         # OK, now we've done with all the multiplies, but
 136         # we still need to handle the unaccumulated
 137         # products in xmm2 and xmm3
 138
 139         addps   %xmm2, %xmm6
 140         addps   %xmm3, %xmm7
 141
 142         # now we want to add all accumulators into xmm4
 143
 144         addps   %xmm5, %xmm4
 145         addps   %xmm6, %xmm7
 146         addps   %xmm7, %xmm4
 147
 148
 149         # At this point, xmm4 contains 4 partial sums.  We need
 150         # to compute a "horizontal add" across xmm4.
 151         # This is a fairly nasty operation...
 152
 153 .cleanup:                               # xmm4 = d1 d2 d3 d4
 154         xorps   %xmm0, %xmm0            # xmm0 = 0 0 0 0  (may be unnecessary)
 155         movhlps %xmm4, %xmm0            # xmm0 = 0 0 d1 d2
 156         addps   %xmm4, %xmm0            # xmm0 = d1 d2 d1+d3 d2+d4
 157         movaps  %xmm0, %xmm1            # xmm1 = d1 d2 d1+d3 d2+d4
 158         shufps  $0xE1, %xmm4, %xmm1     # xmm1 = d1 d2 d2+d4 d1+d3
 159         addss   %xmm1, %xmm0            # xmm1 = d1 d2 d1+d3 d1+d2+d3+d4
 160         movss   %xmm0, 16(%ebp)         # store low 32 bits (sum) to memory
 161         flds    16(%ebp)                # and load onto FPU stack for return
 162
 163         popl    %ebp
 164         ret
 165
 166 FUNC_TAIL(float_dotprod_sse)
 167         .ident  "Hand coded x86 SSE assembly"