1 /* -------------------------------------------------------------- */
2 /* (C)Copyright 2001,2007, */
3 /* International Business Machines Corporation, */
4 /* Sony Computer Entertainment, Incorporated, */
5 /* Toshiba Corporation, */
7 /* All Rights Reserved. */
9 /* Redistribution and use in source and binary forms, with or */
10 /* without modification, are permitted provided that the */
11 /* following conditions are met: */
13 /* - Redistributions of source code must retain the above copyright*/
14 /* notice, this list of conditions and the following disclaimer. */
16 /* - Redistributions in binary form must reproduce the above */
17 /* copyright notice, this list of conditions and the following */
18 /* disclaimer in the documentation and/or other materials */
19 /* provided with the distribution. */
21 /* - Neither the name of IBM Corporation nor the names of its */
22 /* contributors may be used to endorse or promote products */
23 /* derived from this software without specific prior written */
26 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
27 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
28 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
29 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
30 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */
31 /* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
32 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */
33 /* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */
34 /* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
35 /* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
36 /* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */
37 /* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */
38 /* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
39 /* -------------------------------------------------------------- */
40 /* PROLOG END TAG zYx */
42 #define _FFT_1D_R2_H_ 1
48 * Performs a single precision, complex Fast Fourier Transform using
49 * the DFT (Discrete Fourier Transform) with radix-2 decimation in time.
50 * The input <in> is an array of complex numbers of length (1<<log2_size)
51 * entries. The result is returned in the array of complex numbers specified
52 * by <out>. Note: This routine can support an in-place transformation
53 * by specifying <in> and <out> to be the same array.
55 * This implementation utilizes the Cooley-Tukey algorithm consisting
56 * of <log2_size> stages. The basic operation is the butterfly.
58 * p --------------------------> P = p + q*Wi
67 * q --| Wi |-----------------> Q = p - q*Wi
70 * This routine also requires pre-computed twiddle values, W. W is an
71 * array of single precision complex numbers of length 1<<(log2_size-2)
72 * and is computed as follows:
74 * for (i=0; i<n/4; i++)
75 * W[i].real = cos(i * 2*PI/n);
76 * W[i].imag = -sin(i * 2*PI/n);
79 * This array actually only contains the first half of the twiddle
80 * factors. Due to symmetry, the second half of the twiddle factors
81 * are implied and equal:
83 * for (i=0; i<n/4; i++)
84 * W[i+n/4].real = W[i].imag = -sin(i * 2*PI/n);
85 * W[i+n/4].imag = -W[i].real = -cos(i * 2*PI/n);
88 * Further symmetry allows one to generate the twiddle factor table
89 * using half the number of trig computations as follows:
93 * for (i=1; i<n/4; i++)
94 * W[i].real = cos(i * 2*PI/n);
95 * W[n/4 - i].imag = -W[i].real;
98 * The complex numbers are packed into quadwords as follows:
101 * array element array elements
102 * -----------------------------------------------------
103 * i | real 2*i | imag 2*i | real 2*i+1 | imag 2*i+1 |
104 * -----------------------------------------------------
109 static __inline void _fft_1d_r2(vector float *out, vector float *in, vector float *W, int log2_size)
/* NOTE(review): this numbered listing is partially elided. The opening brace,
 * the declarations of the loop/index variables (i, j, k, stage, offset, i_rev),
 * the initialization of n and its derived fractions, several loop headers, and
 * the closing brace are not visible in this view; the surviving lines are kept
 * byte-identical below.
 */
/* Size and twiddle/array stride bookkeeping used by the per-stage
 * butterfly addressing.
 */
114 int n, n_2, n_4, n_8, n_16, n_3_16;
115 int w_stride, w_2stride, w_3stride, w_4stride;
116 int stride, stride_2, stride_4, stride_3_4;
/* Cursors into the twiddle table (W0..W3) and into the split
 * real/imaginary working arrays (re0..re3 / im0..im3).
 */
117 vector float *W0, *W1, *W2, *W3;
118 vector float *re0, *re1, *re2, *re3;
119 vector float *im0, *im1, *im2, *im3;
120 vector float *in0, *in1, *in2, *in3, *in4, *in5, *in6, *in7;
121 vector float *out0, *out1, *out2, *out3;
122 vector float tmp0, tmp1;
123 vector float w0_re, w0_im, w1_re, w1_im;
124 vector float w0, w1, w2, w3;
125 vector float src_lo0, src_lo1, src_lo2, src_lo3;
126 vector float src_hi0, src_hi1, src_hi2, src_hi3;
127 vector float dst_lo0, dst_lo1, dst_lo2, dst_lo3;
128 vector float dst_hi0, dst_hi1, dst_hi2, dst_hi3;
129 vector float out_re_lo0, out_re_lo1, out_re_lo2, out_re_lo3;
130 vector float out_im_lo0, out_im_lo1, out_im_lo2, out_im_lo3;
131 vector float out_re_hi0, out_re_hi1, out_re_hi2, out_re_hi3;
132 vector float out_im_hi0, out_im_hi1, out_im_hi2, out_im_hi3;
133 vector float re_lo0, re_lo1, re_lo2, re_lo3;
134 vector float im_lo0, im_lo1, im_lo2, im_lo3;
135 vector float re_hi0, re_hi1, re_hi2, re_hi3;
136 vector float im_hi0, im_hi1, im_hi2, im_hi3;
137 vector float pq_lo0, pq_lo1, pq_lo2, pq_lo3;
138 vector float pq_hi0, pq_hi1, pq_hi2, pq_hi3;
139 vector float re[MAX_FFT_1D_SIZE/4], im[MAX_FFT_1D_SIZE/4]; /* real & imaginary working arrays */
/* Sign masks: multiplying by ppmm/pmmp via spu_madd lets one fused
 * multiply-add produce both P = p + q*W and Q = p - q*W when a single
 * quadword holds both complex values of a butterfly.
 */
140 vector float ppmm = (vector float) { 1.0f, 1.0f, -1.0f, -1.0f};
141 vector float pmmp = (vector float) { 1.0f, -1.0f, -1.0f, 1.0f};
142 vector unsigned char reverse;
/* Byte-shuffle patterns, named by the 32-bit words they select
 * (operand A = words 0-3, operand B = words 4-7):
 *   shuf_lo / shuf_hi : first / second doubleword of each operand,
 *   shuf_0202 etc.    : replicate a word pair of operand A,
 *   shuf_0415/2637    : interleave low / high word pairs of A and B,
 *   shuf_0246/1357    : de-interleave even / odd words of A and B.
 */
143 vector unsigned char shuf_lo = (vector unsigned char) {
144 0, 1, 2, 3, 4, 5, 6, 7,
145 16,17,18,19, 20,21,22,23};
146 vector unsigned char shuf_hi = (vector unsigned char) {
147 8, 9,10,11, 12,13,14,15,
148 24,25,26,27, 28,29,30,31};
149 vector unsigned char shuf_0202 = (vector unsigned char) {
150 0, 1, 2, 3, 8, 9,10,11,
151 0, 1, 2, 3, 8, 9,10,11};
152 vector unsigned char shuf_1313 = (vector unsigned char) {
153 4, 5, 6, 7, 12,13,14,15,
154 4, 5, 6, 7, 12,13,14,15};
155 vector unsigned char shuf_0303 = (vector unsigned char) {
156 0, 1, 2, 3, 12,13,14,15,
157 0, 1, 2, 3, 12,13,14,15};
158 vector unsigned char shuf_1212 = (vector unsigned char) {
159 4, 5, 6, 7, 8, 9,10,11,
160 4, 5, 6, 7, 8, 9,10,11};
161 vector unsigned char shuf_0415 = (vector unsigned char) {
162 0, 1, 2, 3, 16,17,18,19,
163 4, 5, 6, 7, 20,21,22,23};
164 vector unsigned char shuf_2637 = (vector unsigned char) {
165 8, 9,10,11, 24,25,26,27,
166 12,13,14,15,28,29,30,31};
167 vector unsigned char shuf_0246 = (vector unsigned char) {
168 0, 1, 2, 3, 8, 9,10,11,
169 16,17,18,19,24,25,26,27};
170 vector unsigned char shuf_1357 = (vector unsigned char) {
171 4, 5, 6, 7, 12,13,14,15,
172 20,21,22,23,28,29,30,31};
182 /* Compute a byte reverse shuffle pattern to be used to produce
183 * an address bit swap.
184 */
185 reverse = spu_or(spu_slqwbyte(spu_splats((unsigned char)0x80), log2_size),
186 spu_rlmaskqwbyte(((vec_uchar16){15,14,13,12, 11,10,9,8,
187 7, 6, 5, 4, 3, 2,1,0}),
/* NOTE(review): the final argument of the spu_rlmaskqwbyte() call (and the
 * closing parentheses of the spu_or expression) are on elided lines.
 */
190 /* Perform the first 3 stages of the FFT. These stages differ from
191 * other stages in that the inputs are unscrambled and the data is
192 * reformatted into parallel arrays (ie, separate real and imaginary
193 * arrays). The term "unscramble" means bit-address reversing the
194 * data array. In addition, the first three stages have simple twiddle
195 * factors:
197 * stage 2: (1, 0) and (0, -1)
198 * stage 3: (1, 0), (0.707, -0.707), (0, -1), (-0.707, -0.707)
199 *
200 * The arrays are processed as two halves, simultaneously. The lo (first
201 * half) and hi (second half). This is done because the scramble
202 * shares source values between each half of the output arrays.
203 */
/* Stage-3 twiddles W^0, W^(n/8), W^(n/4), W^(3n/8):
 * 1, (1-j)/sqrt(2), -j, (-1-j)/sqrt(2), split into real and imag parts.
 */
222 w0_re = (vector float) { 1.0f, INV_SQRT_2, 0.0f, -INV_SQRT_2};
223 w0_im = (vector float) { 0.0f, -INV_SQRT_2, -1.0f, -INV_SQRT_2};
/* Gather the eight input quadwords at the bit-reversed index i_rev
 * (i_rev is computed from the "reverse" pattern; the enclosing loop
 * header is on an elided line).
 */
226 src_lo0 = in0[i_rev];
227 src_lo1 = in1[i_rev];
228 src_lo2 = in2[i_rev];
229 src_lo3 = in3[i_rev];
230 
231 src_hi0 = in4[i_rev];
232 src_hi1 = in5[i_rev];
233 src_hi2 = in6[i_rev];
234 src_hi3 = in7[i_rev];
/* Split each lo/hi pair into first-half / second-half doublewords. */
238 dst_lo0 = spu_shuffle(src_lo0, src_hi0, shuf_lo);
239 dst_hi0 = spu_shuffle(src_lo0, src_hi0, shuf_hi);
240 dst_lo1 = spu_shuffle(src_lo1, src_hi1, shuf_lo);
241 dst_hi1 = spu_shuffle(src_lo1, src_hi1, shuf_hi);
242 dst_lo2 = spu_shuffle(src_lo2, src_hi2, shuf_lo);
243 dst_hi2 = spu_shuffle(src_lo2, src_hi2, shuf_hi);
244 dst_lo3 = spu_shuffle(src_lo3, src_hi3, shuf_lo);
245 dst_hi3 = spu_shuffle(src_lo3, src_hi3, shuf_hi);
247 /* Perform the stage 1 butterfly. The multiplier constant, ppmm,
248 * is used to control the sign of the operands since a single
249 * quadword contains both the P and Q values of the butterfly.
250 */
251 pq_lo0 = spu_madd(ppmm, dst_lo0, spu_rlqwbyte(dst_lo0, 8));
252 pq_hi0 = spu_madd(ppmm, dst_hi0, spu_rlqwbyte(dst_hi0, 8));
253 pq_lo1 = spu_madd(ppmm, dst_lo1, spu_rlqwbyte(dst_lo1, 8));
254 pq_hi1 = spu_madd(ppmm, dst_hi1, spu_rlqwbyte(dst_hi1, 8));
255 pq_lo2 = spu_madd(ppmm, dst_lo2, spu_rlqwbyte(dst_lo2, 8));
256 pq_hi2 = spu_madd(ppmm, dst_hi2, spu_rlqwbyte(dst_hi2, 8));
257 pq_lo3 = spu_madd(ppmm, dst_lo3, spu_rlqwbyte(dst_lo3, 8));
258 pq_hi3 = spu_madd(ppmm, dst_hi3, spu_rlqwbyte(dst_hi3, 8));
260 /* Perform the stage 2 butterfly. For this stage, the
261 * inputs pq are still interleaved (p.real, p.imag, q.real,
262 * q.imag), so we must first re-order the data into
263 * parallel arrays as well as perform the reorder
264 * associated with the twiddle W[n/4], which equals
265 * (0, -1).
267 * ie. (A, B) * (0, -1) => (B, -A)
268 */
269 re_lo0 = spu_madd(ppmm,
270 spu_shuffle(pq_lo1, pq_lo1, shuf_0303),
271 spu_shuffle(pq_lo0, pq_lo0, shuf_0202));
272 im_lo0 = spu_madd(pmmp,
273 spu_shuffle(pq_lo1, pq_lo1, shuf_1212),
274 spu_shuffle(pq_lo0, pq_lo0, shuf_1313));
276 re_lo1 = spu_madd(ppmm,
277 spu_shuffle(pq_lo3, pq_lo3, shuf_0303),
278 spu_shuffle(pq_lo2, pq_lo2, shuf_0202));
279 im_lo1 = spu_madd(pmmp,
280 spu_shuffle(pq_lo3, pq_lo3, shuf_1212),
281 spu_shuffle(pq_lo2, pq_lo2, shuf_1313));
/* Same stage-2 computation for the second (hi) half. */
284 re_hi0 = spu_madd(ppmm,
285 spu_shuffle(pq_hi1, pq_hi1, shuf_0303),
286 spu_shuffle(pq_hi0, pq_hi0, shuf_0202));
287 im_hi0 = spu_madd(pmmp,
288 spu_shuffle(pq_hi1, pq_hi1, shuf_1212),
289 spu_shuffle(pq_hi0, pq_hi0, shuf_1313));
291 re_hi1 = spu_madd(ppmm,
292 spu_shuffle(pq_hi3, pq_hi3, shuf_0303),
293 spu_shuffle(pq_hi2, pq_hi2, shuf_0202));
294 im_hi1 = spu_madd(pmmp,
295 spu_shuffle(pq_hi3, pq_hi3, shuf_1212),
296 spu_shuffle(pq_hi2, pq_hi2, shuf_1313));
299 /* Perform stage 3 butterfly.
300 */
/* NOTE(review): FFT_1D_BUTTERFLY / FFT_1D_BUTTERFLY_HI are macros defined
 * outside this view (presumably in a companion fft_1d header) - they appear
 * to compute P = p + q*W, Q = p - q*W on split re/im vectors, with the _HI
 * variant covering the implied second-half twiddles; confirm against the
 * macro definitions.
 */
301 FFT_1D_BUTTERFLY(re0[0], im0[0], re0[1], im0[1], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
302 FFT_1D_BUTTERFLY(re1[0], im1[0], re1[1], im1[1], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
/* Advance the bit-reversed read index for the next loop iteration
 * (BIT_SWAP is defined outside this view).
 */
310 i_rev = BIT_SWAP(i, reverse) / 2;
313 /* Process stages 4 to log2_size-2
314 */
315 for (stage=4, stride=4; stage<log2_size-1; stage++, stride += stride) {
316 w_stride = n_2 >> stage;
317 w_2stride = n >> stage;
318 w_3stride = w_stride + w_2stride;
319 w_4stride = w_2stride + w_2stride;
326 stride_2 = stride >> 1;
327 stride_4 = stride >> 2;
328 stride_3_4 = stride_2 + stride_4;
/* re0/im0 base pointers are set on elided lines; re1..re3 address the
 * quarter-offset partitions of the working arrays.
 */
331 re1 = re + stride_2; im1 = im + stride_2;
332 re2 = re + stride_4; im2 = im + stride_4;
333 re3 = re + stride_3_4; im3 = im + stride_3_4;
335 for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
336 /* Compute the twiddle factors
337 * (w0..w3 loads from W[] are on elided lines).
338 */
343 tmp0 = spu_shuffle(w0, w2, shuf_0415);
344 tmp1 = spu_shuffle(w1, w3, shuf_0415);
346 w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
347 w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
/* Fetch butterfly inputs at indices j and k (inner-loop header elided). */
352 re_lo0 = re0[j]; im_lo0 = im0[j];
353 re_lo1 = re1[j]; im_lo1 = im1[j];
355 re_hi0 = re2[j]; im_hi0 = im2[j];
356 re_hi1 = re3[j]; im_hi1 = im3[j];
358 re_lo2 = re0[k]; im_lo2 = im0[k];
359 re_lo3 = re1[k]; im_lo3 = im1[k];
361 re_hi2 = re2[k]; im_hi2 = im2[k];
362 re_hi3 = re3[k]; im_hi3 = im3[k];
364 FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
365 FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
367 FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
368 FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
376 /* Process stage log2_size-1. This is identical to the stage processing above
377 * except for this stage the inner loop is only executed once so it is removed
378 */
380 w_stride = n_2 >> stage;
381 w_2stride = n >> stage;
382 w_3stride = w_stride + w_2stride;
383 w_4stride = w_2stride + w_2stride;
385 stride_2 = stride >> 1;
386 stride_4 = stride >> 2;
388 stride_3_4 = stride_2 + stride_4;
391 re1 = re + stride_2; im1 = im + stride_2;
392 re2 = re + stride_4; im2 = im + stride_4;
393 re3 = re + stride_3_4; im3 = im + stride_3_4;
395 for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
396 /* Compute the twiddle factors
397 * (the w0 = W[offset] load is on an elided line).
398 */
399 w1 = W[offset + w_stride];
400 w2 = W[offset + w_2stride];
401 w3 = W[offset + w_3stride];
403 tmp0 = spu_shuffle(w0, w2, shuf_0415);
404 tmp1 = spu_shuffle(w1, w3, shuf_0415);
406 w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
407 w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
412 re_lo0 = re0[j]; im_lo0 = im0[j];
413 re_lo1 = re1[j]; im_lo1 = im1[j];
415 re_hi0 = re2[j]; im_hi0 = im2[j];
416 re_hi1 = re3[j]; im_hi1 = im3[j];
418 re_lo2 = re0[k]; im_lo2 = im0[k];
419 re_lo3 = re1[k]; im_lo3 = im1[k];
421 re_hi2 = re2[k]; im_hi2 = im2[k];
422 re_hi3 = re3[k]; im_hi3 = im3[k];
424 FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
425 FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
427 FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
428 FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
432 /* Process the final stage (stage log2_size). For this stage,
433 * reformat the data from parallel arrays back into
434 * interleaved arrays, storing the result into <in>.
435 *
436 * This loop has been manually unrolled by 2 to improve
437 * dual issue rates and reduce stalls. This unrolling
438 * forces a minimum FFT size of 32.
439 */
458 /* Fetch the twiddle factors
459 * (the w0..w3 loads and out0..out3 pointer setup are on elided lines).
460 */
467 w0_re = spu_shuffle(w0, w1, shuf_0246);
468 w0_im = spu_shuffle(w0, w1, shuf_1357);
469 w1_re = spu_shuffle(w2, w3, shuf_0246);
470 w1_im = spu_shuffle(w2, w3, shuf_1357);
472 /* Fetch the butterfly inputs, reals and imaginaries
473 */
474 re_lo0 = re0[0]; im_lo0 = im0[0];
475 re_lo1 = re1[0]; im_lo1 = im1[0];
476 re_lo2 = re0[1]; im_lo2 = im0[1];
477 re_lo3 = re1[1]; im_lo3 = im1[1];
479 re_hi0 = re2[0]; im_hi0 = im2[0];
480 re_hi1 = re3[0]; im_hi1 = im3[0];
481 re_hi2 = re2[1]; im_hi2 = im2[1];
482 re_hi3 = re3[1]; im_hi3 = im3[1];
489 /* Perform the butterflys
490 */
491 FFT_1D_BUTTERFLY (out_re_lo0, out_im_lo0, out_re_lo1, out_im_lo1, re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
492 FFT_1D_BUTTERFLY (out_re_lo2, out_im_lo2, out_re_lo3, out_im_lo3, re_lo2, im_lo2, re_lo3, im_lo3, w1_re, w1_im);
494 FFT_1D_BUTTERFLY_HI(out_re_hi0, out_im_hi0, out_re_hi1, out_im_hi1, re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
495 FFT_1D_BUTTERFLY_HI(out_re_hi2, out_im_hi2, out_re_hi3, out_im_hi3, re_hi2, im_hi2, re_hi3, im_hi3, w1_re, w1_im);
497 /* Interleave the results and store them into the output buffers (ie,
498 * the original input buffers).
499 */
500 out0[0] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_0415);
501 out0[1] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_2637);
502 out0[2] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_0415);
503 out0[3] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_2637);
505 out1[0] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_0415);
506 out1[1] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_2637);
507 out1[2] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_0415);
508 out1[3] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_2637);
510 out2[0] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_0415);
511 out2[1] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_2637);
512 out2[2] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_0415);
513 out2[3] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_2637);
515 out3[0] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_0415);
516 out3[1] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_2637);
517 out3[2] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_0415);
518 out3[3] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_2637);
529 #endif /* _FFT_1D_R2_H_ */