/* -*- c -*- */
/*
 * Copyright 2008 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
25 #include <dotprod_fff_altivec.h>
26 #include <gr_altivec.h>
/*!
 * \brief Round \p x down to the nearest multiple of \p pow2.
 *
 * \param x     value to round
 * \param pow2  must be a power of 2
 * \returns \p x rounded down to a multiple of \p pow2.
 */
static inline size_t
gr_p2_round_down(size_t x, size_t pow2)
{
  // For a power of 2, -pow2 (unsigned two's-complement negation) is the
  // mask ~(pow2 - 1); ANDing clears the low-order bits, rounding down.
  return x & -pow2;
}
#if 0	// compiled out: superseded by the AltiVec implementation of the same name
/*
 * \brief Scalar reference implementation of the single-precision dot product.
 *
 * Kept (disabled) as documentation of the intended result of the
 * vectorized version.  Enabling it while the AltiVec version is also
 * compiled would redefine the symbol.
 *
 * \param a  first operand array of \p n floats
 * \param b  second operand array of \p n floats
 * \param n  number of elements
 * \returns sum over i of a[i] * b[i]
 */
float
dotprod_fff_altivec(const float *a, const float *b, size_t n)
{
  float sum = 0;
  for (size_t i = 0; i < n; i++){
    sum += a[i] * b[i];
  }
  return sum;
}
#endif
57 * n > 0 and a multiple of 4
62 dotprod_fff_altivec(const float *_a, const float *_b, size_t n)
64 const vec_float4 *a = (const vec_float4 *) _a;
65 const vec_float4 *b = (const vec_float4 *) _b;
67 static const size_t UNROLL_CNT = 4;
69 n = gr_p2_round_down(n, 4);
70 size_t loop_cnt = n / (UNROLL_CNT * FLOATS_PER_VEC);
71 size_t nleft = n % (UNROLL_CNT * FLOATS_PER_VEC);
73 // printf("n = %zd, loop_cnt = %zd, nleft = %zd\n", n, loop_cnt, nleft);
75 // Used with vperm to build a* from p*
76 vec_uchar16 lvsl_a = vec_lvsl(0, _a);
78 vec_float4 p0, p1, p2, p3;
79 vec_float4 a0, a1, a2, a3;
80 vec_float4 b0, b1, b2, b3;
81 vec_float4 acc0 = {0, 0, 0, 0};
82 vec_float4 acc1 = {0, 0, 0, 0};
83 vec_float4 acc2 = {0, 0, 0, 0};
84 vec_float4 acc3 = {0, 0, 0, 0};
94 a0 = vec_perm(p0, p1, lvsl_a);
99 for (i = 0; i < loop_cnt; i++){
101 a1 = vec_perm(p1, p2, lvsl_a);
102 b1 = vec_ld(1*VS, b);
103 p1 = vec_ld(1*VS, a);
104 acc0 = vec_madd(a0, b0, acc0);
106 a2 = vec_perm(p2, p3, lvsl_a);
107 b2 = vec_ld(2*VS, b);
108 p2 = vec_ld(2*VS, a);
109 acc1 = vec_madd(a1, b1, acc1);
111 a3 = vec_perm(p3, p0, lvsl_a);
112 b3 = vec_ld(3*VS, b);
113 p3 = vec_ld(3*VS, a);
114 acc2 = vec_madd(a2, b2, acc2);
119 a0 = vec_perm(p0, p1, lvsl_a);
120 b0 = vec_ld(0*VS, b);
121 p0 = vec_ld(0*VS, a);
122 acc3 = vec_madd(a3, b3, acc3);
126 * The compiler ought to be able to figure out that 0, 4, 8 and 12
127 * are the only possible values for nleft.
134 acc0 = vec_madd(a0, b0, acc0);
138 a1 = vec_perm(p1, p2, lvsl_a);
139 b1 = vec_ld(1*VS, b);
140 acc0 = vec_madd(a0, b0, acc0);
141 acc1 = vec_madd(a1, b1, acc1);
145 a1 = vec_perm(p1, p2, lvsl_a);
146 b1 = vec_ld(1*VS, b);
147 acc0 = vec_madd(a0, b0, acc0);
148 a2 = vec_perm(p2, p3, lvsl_a);
149 b2 = vec_ld(2*VS, b);
150 acc1 = vec_madd(a1, b1, acc1);
151 acc2 = vec_madd(a2, b2, acc2);
159 return horizontal_add_f(acc0);