git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/ccomplex_dotprod_sse.S

   1 #
   2 # Copyright 2002 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22
  23 # input and taps are guarenteed to be 16 byte aligned.
  24 # n_2_ccomplex_blocks is != 0
  25 #
  26 #
  27 #  ccomplex_dotprod_generic (const float *input,
  28 #                         const float *taps, unsigned n_2_ccomplex_blocks, float *result)
  29 #  {
  30 #    float sum0 = 0;
  31 #    float sum1 = 0;
  32 #    float sum2 = 0;
  33 #    float sum3 = 0;
  34 #
  35 #    do {
  36 #
  37 #      sum0 += input[0] * taps[0] - input[1] * taps[1];
  38 #      sum1 += input[0] * taps[1] + input[1] * taps[0];
  39 #      sum2 += input[2] * taps[2] - input[3] * taps[3];
  40 #      sum3 += input[2] * taps[3] + input[3] * taps[2];
  41 #
  42 #      input += 4;
  43 #      taps += 4;
  44 #
  45 #    } while (--n_2_ccomplex_blocks != 0);
  46 #
  47 #
  48 #    result[0] = sum0 + sum2;
  49 #    result[1] = sum1 + sum3;
  50 #  }
  51 #
  52
  53 # TODO: prefetch and better scheduling
  54
  55 #include "assembly.h"
  56
  57         .file   "ccomplex_dotprod_sse.S"
  58         .version        "01.01"
  59 .text
  60         .p2align 4
  61 .globl GLOB_SYMB(ccomplex_dotprod_sse)
  62         DEF_FUNC_HEAD(ccomplex_dotprod_sse)
  63 GLOB_SYMB(ccomplex_dotprod_sse):
  64         pushl   %ebp
  65         movl    %esp, %ebp
  66         movl    8(%ebp), %eax           # input
  67         movl    12(%ebp), %edx          # taps
  68         movl    16(%ebp), %ecx          # n_2_ccomplex_blocks
  69
  70         xorps   %xmm6, %xmm6            # zero accumulators
  71
  72         movaps  0(%eax), %xmm0
  73
  74         xorps   %xmm7, %xmm7            # zero accumulators
  75
  76         movaps  0(%edx), %xmm2
  77
  78         shrl    $1, %ecx                # ecx = n_2_ccomplex_blocks / 2
  79
  80         jmp     .L1_test
  81
  82         #
  83         # 4 taps / loop
  84         # something like ?? cycles / loop
  85         #
  86
  87         .p2align 4
  88 .loop1:
  89
  90 # complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000
  91 #
  92 #       movaps  (%eax), %xmmA
  93 #       movaps  (%edx), %xmmB
  94 #
  95 #       movaps  %xmmA, %xmmZ
  96 #       shufps  $0xb1, %xmmZ, %xmmZ     # swap internals
  97 #
  98 #       mulps   %xmmB, %xmmA
  99 #       mulps   %xmmZ, %xmmB
 100 #
 101 #       # SSE replacement for: pfpnacc %xmmB, %xmmA
 102 #       xorps   %xmmPN, %xmmA
 103 #       movaps  %xmmA, %xmmZ
 104 #       unpcklps %xmmB, %xmmA
 105 #       unpckhps %xmmB, %xmmZ
 106 #       movaps  %xmmZ, %xmmY
 107 #       shufps  $0x44, %xmmA, %xmmZ     # b01000100
 108 #       shufps  $0xee, %xmmY, %xmmA     # b11101110
 109 #       addps   %xmmZ, %xmmA
 110 #
 111 #       addps   %xmmA, %xmmC
 112
 113 # A=xmm0, B=xmm2, Z=xmm4
 114 # A'=xmm1, B'=xmm3, Z'=xmm5
 115
 116         movaps  16(%eax), %xmm1
 117
 118         movaps  %xmm0, %xmm4
 119         mulps   %xmm2, %xmm0
 120
 121         shufps  $0xb1, %xmm4, %xmm4     # swap internals
 122         movaps  16(%edx), %xmm3
 123         movaps  %xmm1, %xmm5
 124         addps   %xmm0, %xmm6
 125         mulps   %xmm3, %xmm1
 126         shufps  $0xb1, %xmm5, %xmm5     # swap internals
 127         addps   %xmm1, %xmm6
 128         mulps   %xmm4, %xmm2
 129         movaps  32(%eax), %xmm0
 130         addps   %xmm2, %xmm7
 131         mulps   %xmm5, %xmm3
 132
 133         addl    $32, %eax
 134
 135         movaps  32(%edx), %xmm2
 136         addps   %xmm3, %xmm7
 137
 138         addl    $32, %edx
 139
 140
 141
 142 .L1_test:
 143         decl    %ecx
 144         jge     .loop1
 145
 146         # We've handled the bulk of multiplies up to here.
 147         # Let's sse if original n_2_ccomplex_blocks was odd.
 148         # If so, we've got 2 more taps to do.
 149
 150         movl    16(%ebp), %ecx          # n_2_ccomplex_blocks
 151         andl    $1, %ecx
 152         je      .Leven
 153
 154         # The count was odd, do 2 more taps.
 155         # Note that we've already got mm0/mm2 preloaded
 156         # from the main loop.
 157
 158         movaps  %xmm0, %xmm4
 159         mulps   %xmm2, %xmm0
 160         shufps  $0xb1, %xmm4, %xmm4     # swap internals
 161         addps   %xmm0, %xmm6
 162         mulps   %xmm4, %xmm2
 163         addps   %xmm2, %xmm7
 164
 165
 166 .Leven:
 167         # neg inversor
 168         xorps   %xmm1, %xmm1
 169         movl    $0x80000000, 16(%ebp)
 170         movss   16(%ebp), %xmm1
 171         shufps  $0x11, %xmm1, %xmm1     # b00010001 # 0 -0 0 -0
 172
 173         # pfpnacc
 174         xorps   %xmm1, %xmm6
 175
 176         movaps  %xmm6, %xmm2
 177         unpcklps %xmm7, %xmm6
 178         unpckhps %xmm7, %xmm2
 179         movaps  %xmm2, %xmm3
 180         shufps  $0x44, %xmm6, %xmm2     # b01000100
 181         shufps  $0xee, %xmm3, %xmm6     # b11101110
 182         addps   %xmm2, %xmm6
 183
 184                                         # xmm6 = r1 i2 r3 i4
 185         movl    20(%ebp), %eax          # @result
 186         movhlps %xmm6, %xmm4            # xmm4 = r3 i4 ?? ??
 187         addps   %xmm4, %xmm6            # xmm6 = r1+r3 i2+i4 ?? ??
 188         movlps  %xmm6, (%eax)           # store low 2x32 bits (complex) to memory
 189
 190         popl    %ebp
 191         ret
 192
 193 FUNC_TAIL(ccomplex_dotprod_sse)
 194         .ident  "Hand coded x86 SSE assembly"