git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/3dnow_float_dotprod_simple.S

   1 #
   2 # Copyright 2002 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22
  23 # input and taps are guaranteed to be 16 byte aligned.
  24 # n_4_float_blocks is != 0
  25 #
  26 #
  27 #  float
  28 #  sse_float_dotprod (const float *input,
  29 #                  const float *taps, unsigned n_4_float_blocks)
  30 #  {
  31 #    float sum0 = 0;
  32 #    float sum1 = 0;
  33 #    float sum2 = 0;
  34 #    float sum3 = 0;
  35 #
  36 #    do {
  37 #
  38 #      sum0 += input[0] * taps[0];
  39 #      sum1 += input[1] * taps[1];
  40 #      sum2 += input[2] * taps[2];
  41 #      sum3 += input[3] * taps[3];
  42 #
  43 #      input += 4;
  44 #      taps += 4;
  45 #
  46 #    } while (--n_4_float_blocks != 0);
  47 #
  48 #
  49 #    return sum0 + sum1 + sum2 + sum3;
  50 #  }
  51 #
  52
  53
  54         .file   "3dnow_float_dotprod_simple.s"
  55         .version        "01.01"
  56 .text
  57         .p2align 4
  58 .globl sse_float_dotprod
  59         .type    sse_float_dotprod,@function
  60 sse_float_dotprod:
  61         pushl   %ebp
  62         movl    %esp, %ebp
  63         movl    8(%ebp), %edx
  64         movl    12(%ebp), %eax
  65         movl    16(%ebp), %ecx
  66
  67
  68         # The plan is to get it computing the correct answer, and
  69         # then to unroll and schedule the inner loop.
  70
  71         pxor    %mm4, %mm4              # mm4 = 0 0
  72         pxor    %mm5, %mm5              # mm5 = 0 0
  73
  74         .p2align 4
  75 .Loop1:
  76         movq    0(%eax), %mm0
  77         movq    8(%eax), %mm1
  78
  79         pfmul   0(%edx), %mm0
  80         pfadd   %mm0, %mm4
  81
  82         pfmul   8(%edx), %mm1
  83         pfadd   %mm1, %mm5
  84
  85         addl    $16, %edx
  86         addl    $16, %eax
  87         decl    %ecx
  88         jne     .Loop1
  89
  90         # at this point mm4 and mm5 contain partial sums
  91
  92         pfadd   %mm5, %mm4
  93         pfacc   %mm4, %mm4
  94         movd    %mm4, 16(%ebp)
  95         femms
  96         flds    16(%ebp)
  97
  98         popl    %ebp
  99         ret
 100 .Lfe1:
 101         .size    sse_float_dotprod,.Lfe1-sse_float_dotprod
 102         .ident  "Hand coded x86 3DNow! assembly"
 103
 104 #if defined(__linux__) && defined(__ELF__)
 105 .section .note.GNU-stack,"",%progbits
 106 #endif