git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/dotprod_fff_altivec.c

   1 /* -*- c++ -*- */
   2 /*
   3  * Copyright 2008 Free Software Foundation, Inc.
   4  *
   5  * This file is part of GNU Radio
   6  *
   7  * GNU Radio is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 3, or (at your option)
  10  * any later version.
  11  *
  12  * GNU Radio is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License along
  18  * with this program; if not, write to the Free Software Foundation, Inc.,
  19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25 #include <dotprod_fff_altivec.h>
  26 #include <gr_altivec.h>
  27
  28 /*!
  29  * \param x any value
  30  * \param pow2 must be a power of 2
  31  * \returns \p x rounded down to a multiple of \p pow2.
  32  */
  33 static inline size_t
  34 gr_p2_round_down(size_t x, size_t pow2)
  35 {
  36   return x & -pow2;
  37 }
  38
  39
  40 #if 0
  41
  42 float
  43 dotprod_fff_altivec(const float *a, const float *b, size_t n)
  44 {
  45   float sum = 0;
  46   for (size_t i = 0; i < n; i++){
  47     sum += a[i] * b[i];
  48   }
  49   return sum;
  50 }
  51
  52 #else
  53
  54 /*
  55  *  preconditions:
  56  *
  57  *    n > 0 and a multiple of 4
  58  *    a   4-byte aligned
  59  *    b  16-byte aligned
  60  */
  61 float
  62 dotprod_fff_altivec(const float *_a, const float *_b, size_t n)
  63 {
  64   const vec_float4 *a = (const vec_float4 *) _a;
  65   const vec_float4 *b = (const vec_float4 *) _b;
  66
  67   static const size_t UNROLL_CNT = 4;
  68
  69   n = gr_p2_round_down(n, 4);
  70   size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC);
  71   size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC);
  72
  73   // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft);
  74
  75   // Used with vperm to build a* from p*
  76   vec_uchar16 lvsl_a = vec_lvsl(0, _a);
  77
  78   vec_float4 p0, p1, p2, p3;
  79   vec_float4 a0, a1, a2, a3;
  80   vec_float4 b0, b1, b2, b3;
  81   vec_float4 acc0 = {0, 0, 0, 0};
  82   vec_float4 acc1 = {0, 0, 0, 0};
  83   vec_float4 acc2 = {0, 0, 0, 0};
  84   vec_float4 acc3 = {0, 0, 0, 0};
  85
  86   // wind in
  87
  88   p0 = vec_ld(0*VS, a);
  89   p1 = vec_ld(1*VS, a);
  90   p2 = vec_ld(2*VS, a);
  91   p3 = vec_ld(3*VS, a);
  92   a += UNROLL_CNT;
  93
  94   a0 = vec_perm(p0, p1, lvsl_a);
  95   b0 = vec_ld(0*VS, b);
  96   p0 = vec_ld(0*VS, a);
  97
  98   size_t i;
  99   for (i = 0; i < loop_cnt; i++){
 100
 101     a1 = vec_perm(p1, p2, lvsl_a);
 102     b1 = vec_ld(1*VS, b);
 103     p1 = vec_ld(1*VS, a);
 104     acc0 = vec_madd(a0, b0, acc0);
 105
 106     a2 = vec_perm(p2, p3, lvsl_a);
 107     b2 = vec_ld(2*VS, b);
 108     p2 = vec_ld(2*VS, a);
 109     acc1 = vec_madd(a1, b1, acc1);
 110
 111     a3 = vec_perm(p3, p0, lvsl_a);
 112     b3 = vec_ld(3*VS, b);
 113     p3 = vec_ld(3*VS, a);
 114     acc2 = vec_madd(a2, b2, acc2);
 115
 116     a += UNROLL_CNT;
 117     b += UNROLL_CNT;
 118
 119     a0 = vec_perm(p0, p1, lvsl_a);
 120     b0 = vec_ld(0*VS, b);
 121     p0 = vec_ld(0*VS, a);
 122     acc3 = vec_madd(a3, b3, acc3);
 123   }
 124
 125   /*
 126    * The compiler ought to be able to figure out that 0, 4, 8 and 12
 127    * are the only possible values for nleft.
 128    */
 129   switch (nleft){
 130   case 0:
 131     break;
 132
 133   case 4:
 134     acc0 = vec_madd(a0, b0, acc0);
 135     break;
 136
 137   case 8:
 138     a1 = vec_perm(p1, p2, lvsl_a);
 139     b1 = vec_ld(1*VS, b);
 140     acc0 = vec_madd(a0, b0, acc0);
 141     acc1 = vec_madd(a1, b1, acc1);
 142     break;
 143
 144   case 12:
 145     a1 = vec_perm(p1, p2, lvsl_a);
 146     b1 = vec_ld(1*VS, b);
 147     acc0 = vec_madd(a0, b0, acc0);
 148     a2 = vec_perm(p2, p3, lvsl_a);
 149     b2 = vec_ld(2*VS, b);
 150     acc1 = vec_madd(a1, b1, acc1);
 151     acc2 = vec_madd(a2, b2, acc2);
 152     break;
 153   }
 154
 155   acc0 = acc0 + acc1;
 156   acc2 = acc2 + acc3;
 157   acc0 = acc0 + acc2;
 158
 159   return horizontal_add_f(acc0);
 160 }
 161
 162 #endif