git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/complex_dotprod_sse.S

   1 #
   2 # Copyright 2002 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22
  23 # input and taps are guaranteed to be 16 byte aligned.
  24 # n_2_complex_blocks is != 0
  25 #
  26 #
  27 #  complex_dotprod_generic (const short *input,
  28 #                         const float *taps, unsigned n_2_complex_blocks, float *result)
  29 #  {
  30 #    float sum0 = 0;
  31 #    float sum1 = 0;
  32 #    float sum2 = 0;
  33 #    float sum3 = 0;
  34 #
  35 #    do {
  36 #
  37 #      sum0 += input[0] * taps[0];
  38 #      sum1 += input[0] * taps[1];
  39 #      sum2 += input[1] * taps[2];
  40 #      sum3 += input[1] * taps[3];
  41 #
  42 #      input += 2;
  43 #      taps += 4;
  44 #
  45 #    } while (--n_2_complex_blocks != 0);
  46 #
  47 #
  48 #    result[0] = sum0 + sum2;
  49 #    result[1] = sum1 + sum3;
  50 #  }
  51 #
  52
  53 # TODO: prefetch and better scheduling
  54
  55 #include "assembly.h"
  56
  57
  .file   "complex_dotprod_sse.S"
  .version        "01.01"
.text
        .p2align 4
.globl GLOB_SYMB(complex_dotprod_sse)
        DEF_FUNC_HEAD(complex_dotprod_sse)

#
# complex_dotprod_sse (const short *input, const float *taps,
#                      unsigned n_2_complex_blocks, float *result)
#
# Syntax: AT&T, 32-bit x86.  Uses MMX and SSE instructions.
#
# Args are read from the stack frame:
#    8(%ebp)  input               (16-bit samples)
#   12(%ebp)  taps                (floats)
#   16(%ebp)  n_2_complex_blocks  (block count)
#   20(%ebp)  result              (two floats are stored here)
#
# One block is two 16-bit samples (4 bytes of input) and four float
# taps (16 bytes).  Both samples are converted to float, each one is
# duplicated into a pair of lanes {s0, s0, s1, s1}, and the vector is
# multiplied lane by lane with the four taps.  Products are summed
# per lane; at the end lanes 0+2 and 1+3 are added and stored to
# result[0] and result[1].
#
# Preconditions (stated by the comment at the top of this file):
# input and taps are 16 byte aligned, n_2_complex_blocks is not 0.
# mulps with a memory operand needs the 16 byte alignment of taps.
#
# Only the n_2_complex_blocks * 4 bytes of input and the
# n_2_complex_blocks * 16 bytes of taps are read; nothing beyond the
# last block is touched.
#
# Clobbers: eax, ecx, edx, flags, mm0-mm3, xmm0-xmm7.
# emms is executed before returning.
#
GLOB_SYMB(complex_dotprod_sse):
        pushl   %ebp
        movl    %esp, %ebp
        movl    8(%ebp), %eax           # eax = input
        movl    12(%ebp), %edx          # edx = taps
        movl    16(%ebp), %ecx          # ecx = n_2_complex_blocks


        # xmm0 xmm1 xmm2 xmm3 hold converted input and the products
        # xmm4 xmm5 xmm6 xmm7 hold the accumulated results

        xorps   %xmm4, %xmm4            # accumulator 0 = 0
        xorps   %xmm5, %xmm5            # accumulator 1 = 0 (also the zero source below)

        # first handle any non-zero remainder of (n_2_complex_blocks % 4),
        # one block per pass, accumulating into xmm4

        andl    $0x3, %ecx              # ecx = number of remainder blocks (0..3)
        jmp     .L1_test                # test first: the remainder may be zero

        .p2align 4
.loop1:

        pxor    %mm0, %mm0
        punpcklwd       0(%eax), %mm0   # each dword = sample << 16
        psrad   $16, %mm0               # arithmetic shift: sign-extended 32-bit ints
        cvtpi2ps %mm0, %xmm0            # low two lanes = (float) s0, (float) s1
        shufps  $0x50, %xmm0, %xmm0     # xmm0 = s0 s0 s1 s1

        mulps   (%edx), %xmm0           # multiply by the four taps of this block
        addl    $0x10, %edx             # taps  += 4 floats
        addl    $4, %eax                # input += 2 shorts
        addps   %xmm0, %xmm4
.L1_test:
        decl    %ecx
        jge     .loop1                  # signed test: loop until ecx drops below zero


        # set up for primary loop which is unrolled 4 times

        movl    16(%ebp), %ecx          # reload the full block count
        movaps  %xmm5, %xmm6            # zero remaining accumulators
        movaps  %xmm5, %xmm7

        shrl    $2, %ecx                # n_2_complex_blocks / 4
        je      .cleanup                # nothing left: xmm5..xmm7 are zero, xmm4 is the total

        # ecx is known to be non-zero here, hence enter the loop at the top.
        #
        # Every pass handles four blocks.  Block k of the pass is loaded
        # from 4*k(%eax), multiplied by the taps at 16*k(%edx) and added to
        # its own accumulator (xmm4 + k).  The add for a block is issued
        # after the multiply of the following block so that it does not
        # wait directly on its own multiply.  All loads belong to the pass
        # that consumes them, so the last pass reads exactly the last
        # sixteen bytes of input and nothing after them.

        .p2align 4
.loop2:
        pxor    %mm0, %mm0              # block 0
        punpcklwd       0(%eax), %mm0
        psrad   $16, %mm0
        cvtpi2ps %mm0, %xmm0
        shufps  $0x50, %xmm0, %xmm0
        mulps   (%edx), %xmm0

        pxor    %mm1, %mm1              # block 1
        punpcklwd       4(%eax), %mm1
        psrad   $16, %mm1
        cvtpi2ps %mm1, %xmm1
        shufps  $0x50, %xmm1, %xmm1
        mulps   0x10(%edx), %xmm1
        addps   %xmm0, %xmm4            # accumulate block 0

        pxor    %mm2, %mm2              # block 2
        punpcklwd       8(%eax), %mm2
        psrad   $16, %mm2
        cvtpi2ps %mm2, %xmm2
        shufps  $0x50, %xmm2, %xmm2
        mulps   0x20(%edx), %xmm2
        addps   %xmm1, %xmm5            # accumulate block 1

        pxor    %mm3, %mm3              # block 3
        punpcklwd       12(%eax), %mm3
        psrad   $16, %mm3
        cvtpi2ps %mm3, %xmm3
        shufps  $0x50, %xmm3, %xmm3
        mulps   0x30(%edx), %xmm3
        addps   %xmm2, %xmm6            # accumulate block 2
        addps   %xmm3, %xmm7            # accumulate block 3

        addl    $0x40, %edx             # taps  += 4 blocks * 4 floats
        addl    $0x10, %eax             # input += 4 blocks * 2 shorts
        decl    %ecx
        jne     .loop2

        # now we want to add all accumulators into xmm4

        addps   %xmm5, %xmm4
        addps   %xmm6, %xmm7
        addps   %xmm7, %xmm4


        # At this point, xmm4 contains 2x2 partial sums.  We need
        # to compute a horizontal complex add across xmm4.

.cleanup:                               # xmm4 lanes 0..3 = sum0 sum1 sum2 sum3
        movl    20(%ebp), %eax          # eax = result
        movhlps %xmm4, %xmm0            # xmm0 lanes 0,1 = sum2 sum3 (lanes 2,3 keep old contents)
        addps   %xmm4, %xmm0            # xmm0 lanes 0,1 = sum0+sum2  sum1+sum3
        movlps  %xmm0, (%eax)           # store the low two floats to result[0], result[1]

        emms                            # leave MMX state before returning
        popl    %ebp
        ret

FUNC_TAIL(complex_dotprod_sse)
        .ident  "Hand coded x86 SSE assembly"

# Mark the object as not needing an executable stack.  Without this
# note section the GNU linker assumes that the object may need one.
#if defined(__linux__) && defined(__ELF__)
        .section .note.GNU-stack,"",%progbits
        .previous
#endif