git.gag.com Git - debian/gnuradio/blob - gcell/include/gcell/spu/fft_1d_r2.h

   1 /* --------------------------------------------------------------  */
   2 /* (C)Copyright 2001,2007,                                         */
   3 /* International Business Machines Corporation,                    */
   4 /* Sony Computer Entertainment, Incorporated,                      */
   5 /* Toshiba Corporation,                                            */
   6 /*                                                                 */
   7 /* All Rights Reserved.                                            */
   8 /*                                                                 */
   9 /* Redistribution and use in source and binary forms, with or      */
  10 /* without modification, are permitted provided that the           */
  11 /* following conditions are met:                                   */
  12 /*                                                                 */
  13 /* - Redistributions of source code must retain the above copyright*/
  14 /*   notice, this list of conditions and the following disclaimer. */
  15 /*                                                                 */
  16 /* - Redistributions in binary form must reproduce the above       */
  17 /*   copyright notice, this list of conditions and the following   */
  18 /*   disclaimer in the documentation and/or other materials        */
  19 /*   provided with the distribution.                               */
  20 /*                                                                 */
  21 /* - Neither the name of IBM Corporation nor the names of its      */
  22 /*   contributors may be used to endorse or promote products       */
  23 /*   derived from this software without specific prior written     */
  24 /*   permission.                                                   */
  25 /*                                                                 */
  26 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
  27 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
  28 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
  29 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
  30 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
  31 /* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
  32 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
  33 /* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
  34 /* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
  35 /* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
  36 /* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
  37 /* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
  38 /* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
  39 /* --------------------------------------------------------------  */
  40 /* PROLOG END TAG zYx                                              */
  41 #ifndef _FFT_1D_R2_H_
  42 #define _FFT_1D_R2_H_   1
  43
  44 #include "fft_1d.h"
  45
  46 /* fft_1d_r2
  47  * ---------
  48  * Performs a single precision, complex Fast Fourier Transform using
  49  * the DFT (Discrete Fourier Transform) with radix-2 decimation in time.
  50  * The input <in> is an array of complex numbers of length (1<<log2_size)
  51  * entries. The result is returned in the array of complex numbers specified
  52  * by <out>. Note: This routine can support an in-place transformation
  53  * by specifying <in> and <out> to be the same array.
  54  *
  55  * This implementation utilizes the Cooley-Tukey algorithm consisting
  56  * of <log2_size> stages. The basic operation is the butterfly.
  57  *
  58  *          p --------------------------> P = p + q*Wi
  59  *                        \      /
  60  *                         \    /
  61  *                          \  /
  62  *                           \/
  63  *                           /\
  64  *                          /  \
  65  *                         /    \
  66  *               ____     /      \
  67  *          q --| Wi |-----------------> Q = p - q*Wi
  68  *               ----
  69  *
  70  * This routine also requires pre-computed twiddle values, W. W is an
  71  * array of single precision complex numbers of length 1<<(log2_size-2)
  72  * and is computed as follows:
  73  *
  74  *      for (i=0; i<n/4; i++)
  75  *          W[i].real =  cos(i * 2*PI/n);
  76  *          W[i].imag = -sin(i * 2*PI/n);
  77  *      }
  78  *
  79  * This array actually only contains the first half of the twiddle
  80  * factors. Due for symmetry, the second half of the twiddle factors
  81  * are implied and equal:
  82  *
  83  *      for (i=0; i<n/4; i++)
  84  *          W[i+n/4].real =  W[i].imag =  sin(i * 2*PI/n);
  85  *          W[i+n/4].imag = -W[i].real = -cos(i * 2*PI/n);
  86  *      }
  87  *
  88  * Further symmetry allows one to generate the twiddle factor table
  89  * using half the number of trig computations as follows:
  90  *
  91  *      W[0].real = 1.0;
  92  *      W[0].imag = 0.0;
  93  *      for (i=1; i<n/4; i++)
  94  *          W[i].real =  cos(i * 2*PI/n);
  95  *          W[n/4 - i].imag = -W[i].real;
  96  *      }
  97  *
  98  * The complex numbers are packed into quadwords as follows:
  99  *
 100  *    quadword                        complex
 101  *  array element                   array elements
 102  *             -----------------------------------------------------
 103  *       i    |  real 2*i   |  imag 2*i  | real 2*i+1  | imag 2*i+1 |
 104  *             -----------------------------------------------------
 105  *
 106  */
 107
 108
 109 static __inline void _fft_1d_r2(vector float *out, vector float *in, vector float *W, int log2_size)
 110 {
 111   int i, j, k;
 112   int stage, offset;
 113   int i_rev;
 114   int n, n_2, n_4, n_8, n_16, n_3_16;
 115   int w_stride, w_2stride, w_3stride, w_4stride;
 116   int stride, stride_2, stride_4, stride_3_4;
 117   vector float *W0, *W1, *W2, *W3;
 118   vector float *re0, *re1, *re2, *re3;
 119   vector float *im0, *im1, *im2, *im3;
 120   vector float *in0, *in1, *in2, *in3, *in4, *in5, *in6, *in7;
 121   vector float *out0, *out1, *out2, *out3;
 122   vector float tmp0, tmp1;
 123   vector float w0_re, w0_im, w1_re, w1_im;
 124   vector float w0, w1, w2, w3;
 125   vector float src_lo0, src_lo1, src_lo2, src_lo3;
 126   vector float src_hi0, src_hi1, src_hi2, src_hi3;
 127   vector float dst_lo0, dst_lo1, dst_lo2, dst_lo3;
 128   vector float dst_hi0, dst_hi1, dst_hi2, dst_hi3;
 129   vector float out_re_lo0, out_re_lo1, out_re_lo2, out_re_lo3;
 130   vector float out_im_lo0, out_im_lo1, out_im_lo2, out_im_lo3;
 131   vector float out_re_hi0, out_re_hi1, out_re_hi2, out_re_hi3;
 132   vector float out_im_hi0, out_im_hi1, out_im_hi2, out_im_hi3;
 133   vector float re_lo0,  re_lo1,  re_lo2,  re_lo3;
 134   vector float im_lo0,  im_lo1,  im_lo2,  im_lo3;
 135   vector float re_hi0,  re_hi1,  re_hi2,  re_hi3;
 136   vector float im_hi0,  im_hi1,  im_hi2,  im_hi3;
 137   vector float pq_lo0,  pq_lo1,  pq_lo2,  pq_lo3;
 138   vector float pq_hi0,  pq_hi1,  pq_hi2,  pq_hi3;
 139   vector float re[MAX_FFT_1D_SIZE/4], im[MAX_FFT_1D_SIZE/4];    /* real & imaginary working arrays */
 140   vector float ppmm = (vector float) { 1.0f,  1.0f, -1.0f, -1.0f};
 141   vector float pmmp = (vector float) { 1.0f, -1.0f, -1.0f,  1.0f};
 142   vector unsigned char reverse;
 143   vector unsigned char shuf_lo = (vector unsigned char) {
 144                                              0,  1, 2, 3,  4, 5, 6, 7,
 145                                              16,17,18,19, 20,21,22,23};
 146   vector unsigned char shuf_hi = (vector unsigned char) {
 147                                              8,  9,10,11, 12,13,14,15,
 148                                              24,25,26,27, 28,29,30,31};
 149   vector unsigned char shuf_0202 = (vector unsigned char) {
 150                                                0, 1, 2, 3,  8, 9,10,11,
 151                                                0, 1, 2, 3,  8, 9,10,11};
 152   vector unsigned char shuf_1313 = (vector unsigned char) {
 153                                                4, 5, 6, 7, 12,13,14,15,
 154                                                4, 5, 6, 7, 12,13,14,15};
 155   vector unsigned char shuf_0303 = (vector unsigned char) {
 156                                                0, 1, 2, 3, 12,13,14,15,
 157                                                0, 1, 2, 3, 12,13,14,15};
 158   vector unsigned char shuf_1212 = (vector unsigned char) {
 159                                                4, 5, 6, 7,  8, 9,10,11,
 160                                                4, 5, 6, 7,  8, 9,10,11};
 161   vector unsigned char shuf_0415 = (vector unsigned char) {
 162                                                0, 1, 2, 3, 16,17,18,19,
 163                                                4, 5, 6, 7, 20,21,22,23};
 164   vector unsigned char shuf_2637 = (vector unsigned char) {
 165                                                8, 9,10,11, 24,25,26,27,
 166                                                12,13,14,15,28,29,30,31};
 167   vector unsigned char shuf_0246 = (vector unsigned char) {
 168                                                0, 1, 2, 3,  8, 9,10,11,
 169                                                16,17,18,19,24,25,26,27};
 170   vector unsigned char shuf_1357 = (vector unsigned char) {
 171                                                4, 5, 6, 7, 12,13,14,15,
 172                                                20,21,22,23,28,29,30,31};
 173
 174   n = 1 << log2_size;
 175   n_2  = n >> 1;
 176   n_4  = n >> 2;
 177   n_8  = n >> 3;
 178   n_16 = n >> 4;
 179
 180   n_3_16 = n_8 + n_16;
 181
 182   /* Compute a byte reverse shuffle pattern to be used to produce
 183    * an address bit swap.
 184    */
 185   reverse = spu_or(spu_slqwbyte(spu_splats((unsigned char)0x80), log2_size),
 186                    spu_rlmaskqwbyte(((vec_uchar16){15,14,13,12, 11,10,9,8,
 187                                                     7, 6, 5, 4,  3, 2,1,0}),
 188                                     log2_size-16));
 189
 190   /* Perform the first 3 stages of the FFT. These stages differs from
 191    * other stages in that the inputs are unscrambled and the data is
 192    * reformated into parallel arrays (ie, seperate real and imaginary
 193    * arrays). The term "unscramble" means the bit address reverse the
 194    * data array. In addition, the first three stages have simple twiddle
 195    * weighting factors.
 196    *            stage 1: (1, 0)
 197    *            stage 2: (1, 0) and (0, -1)
 198    *            stage 3: (1, 0), (0.707, -0.707), (0, -1), (-0.707, -0.707)
 199    *
 200    * The arrays are processed as two halves, simultaneously. The lo (first
 201    * half) and hi (second half). This is done because the scramble
 202    * shares source value between each half of the output arrays.
 203    */
 204   i = 0;
 205   i_rev = 0;
 206
 207   in0 = in;
 208   in1 = in + n_8;
 209   in2 = in + n_16;
 210   in3 = in + n_3_16;
 211
 212   in4 = in  + n_4;
 213   in5 = in1 + n_4;
 214   in6 = in2 + n_4;
 215   in7 = in3 + n_4;
 216
 217   re0 = re;
 218   re1 = re + n_8;
 219   im0 = im;
 220   im1 = im + n_8;
 221
 222   w0_re = (vector float) { 1.0f,  INV_SQRT_2,  0.0f, -INV_SQRT_2};
 223   w0_im = (vector float) { 0.0f, -INV_SQRT_2, -1.0f, -INV_SQRT_2};
 224
 225   do {
 226     src_lo0 = in0[i_rev];
 227     src_lo1 = in1[i_rev];
 228     src_lo2 = in2[i_rev];
 229     src_lo3 = in3[i_rev];
 230
 231     src_hi0 = in4[i_rev];
 232     src_hi1 = in5[i_rev];
 233     src_hi2 = in6[i_rev];
 234     src_hi3 = in7[i_rev];
 235
 236     /* Perform scramble.
 237      */
 238     dst_lo0 = spu_shuffle(src_lo0, src_hi0, shuf_lo);
 239     dst_hi0 = spu_shuffle(src_lo0, src_hi0, shuf_hi);
 240     dst_lo1 = spu_shuffle(src_lo1, src_hi1, shuf_lo);
 241     dst_hi1 = spu_shuffle(src_lo1, src_hi1, shuf_hi);
 242     dst_lo2 = spu_shuffle(src_lo2, src_hi2, shuf_lo);
 243     dst_hi2 = spu_shuffle(src_lo2, src_hi2, shuf_hi);
 244     dst_lo3 = spu_shuffle(src_lo3, src_hi3, shuf_lo);
 245     dst_hi3 = spu_shuffle(src_lo3, src_hi3, shuf_hi);
 246
 247     /* Perform the stage 1 butterfly. The multiplier constant, ppmm,
 248      * is used to control the sign of the operands since a single
 249      * quadword contains both of P and Q valule of the butterfly.
 250      */
 251     pq_lo0 = spu_madd(ppmm, dst_lo0, spu_rlqwbyte(dst_lo0, 8));
 252     pq_hi0 = spu_madd(ppmm, dst_hi0, spu_rlqwbyte(dst_hi0, 8));
 253     pq_lo1 = spu_madd(ppmm, dst_lo1, spu_rlqwbyte(dst_lo1, 8));
 254     pq_hi1 = spu_madd(ppmm, dst_hi1, spu_rlqwbyte(dst_hi1, 8));
 255     pq_lo2 = spu_madd(ppmm, dst_lo2, spu_rlqwbyte(dst_lo2, 8));
 256     pq_hi2 = spu_madd(ppmm, dst_hi2, spu_rlqwbyte(dst_hi2, 8));
 257     pq_lo3 = spu_madd(ppmm, dst_lo3, spu_rlqwbyte(dst_lo3, 8));
 258     pq_hi3 = spu_madd(ppmm, dst_hi3, spu_rlqwbyte(dst_hi3, 8));
 259
 260     /* Perfrom the stage 2 butterfly. For this stage, the
 261      * inputs pq are still interleaved (p.real, p.imag, q.real,
 262      * q.imag), so we must first re-order the data into
 263      * parallel arrays as well as perform the reorder
 264      * associated with the twiddle W[n/4], which equals
 265      * (0, -1).
 266      *
 267      *  ie. (A, B) * (0, -1) => (B, -A)
 268      */
 269     re_lo0 = spu_madd(ppmm,
 270                       spu_shuffle(pq_lo1, pq_lo1, shuf_0303),
 271                       spu_shuffle(pq_lo0, pq_lo0, shuf_0202));
 272     im_lo0 = spu_madd(pmmp,
 273                       spu_shuffle(pq_lo1, pq_lo1, shuf_1212),
 274                       spu_shuffle(pq_lo0, pq_lo0, shuf_1313));
 275
 276     re_lo1 = spu_madd(ppmm,
 277                       spu_shuffle(pq_lo3, pq_lo3, shuf_0303),
 278                       spu_shuffle(pq_lo2, pq_lo2, shuf_0202));
 279     im_lo1 = spu_madd(pmmp,
 280                       spu_shuffle(pq_lo3, pq_lo3, shuf_1212),
 281                       spu_shuffle(pq_lo2, pq_lo2, shuf_1313));
 282
 283
 284     re_hi0 = spu_madd(ppmm,
 285                       spu_shuffle(pq_hi1, pq_hi1, shuf_0303),
 286                       spu_shuffle(pq_hi0, pq_hi0, shuf_0202));
 287     im_hi0 = spu_madd(pmmp,
 288                        spu_shuffle(pq_hi1, pq_hi1, shuf_1212),
 289                        spu_shuffle(pq_hi0, pq_hi0, shuf_1313));
 290
 291     re_hi1 = spu_madd(ppmm,
 292                       spu_shuffle(pq_hi3, pq_hi3, shuf_0303),
 293                       spu_shuffle(pq_hi2, pq_hi2, shuf_0202));
 294     im_hi1 = spu_madd(pmmp,
 295                       spu_shuffle(pq_hi3, pq_hi3, shuf_1212),
 296                       spu_shuffle(pq_hi2, pq_hi2, shuf_1313));
 297
 298
 299     /* Perform stage 3 butterfly.
 300      */
 301     FFT_1D_BUTTERFLY(re0[0], im0[0], re0[1], im0[1], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
 302     FFT_1D_BUTTERFLY(re1[0], im1[0], re1[1], im1[1], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
 303
 304     re0 += 2;
 305     re1 += 2;
 306     im0 += 2;
 307     im1 += 2;
 308
 309     i += 8;
 310     i_rev = BIT_SWAP(i, reverse) / 2;
 311   } while (i < n_2);
 312
 313   /* Process stages 4 to log2_size-2
 314    */
 315   for (stage=4, stride=4; stage<log2_size-1; stage++, stride += stride) {
 316     w_stride  = n_2 >> stage;
 317     w_2stride = n   >> stage;
 318     w_3stride = w_stride +  w_2stride;
 319     w_4stride = w_2stride + w_2stride;
 320
 321     W0 = W;
 322     W1 = W + w_stride;
 323     W2 = W + w_2stride;
 324     W3 = W + w_3stride;
 325
 326     stride_2 = stride >> 1;
 327     stride_4 = stride >> 2;
 328     stride_3_4 = stride_2 + stride_4;
 329
 330     re0 = re;              im0 = im;
 331     re1 = re + stride_2;   im1 = im + stride_2;
 332     re2 = re + stride_4;   im2 = im + stride_4;
 333     re3 = re + stride_3_4; im3 = im + stride_3_4;
 334
 335     for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
 336       /* Compute the twiddle factors
 337        */
 338       w0 = W0[offset];
 339       w1 = W1[offset];
 340       w2 = W2[offset];
 341       w3 = W3[offset];
 342
 343       tmp0 = spu_shuffle(w0, w2, shuf_0415);
 344       tmp1 = spu_shuffle(w1, w3, shuf_0415);
 345
 346       w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
 347       w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
 348
 349       j = i;
 350       k = i + stride;
 351       do {
 352         re_lo0 = re0[j]; im_lo0 = im0[j];
 353         re_lo1 = re1[j]; im_lo1 = im1[j];
 354
 355         re_hi0 = re2[j]; im_hi0 = im2[j];
 356         re_hi1 = re3[j]; im_hi1 = im3[j];
 357
 358         re_lo2 = re0[k]; im_lo2 = im0[k];
 359         re_lo3 = re1[k]; im_lo3 = im1[k];
 360
 361         re_hi2 = re2[k]; im_hi2 = im2[k];
 362         re_hi3 = re3[k]; im_hi3 = im3[k];
 363
 364         FFT_1D_BUTTERFLY   (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
 365         FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
 366
 367         FFT_1D_BUTTERFLY   (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
 368         FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
 369
 370         j += 2 * stride;
 371         k += 2 * stride;
 372       } while (j < n_4);
 373     }
 374   }
 375
 376   /* Process stage log2_size-1. This is identical to the stage processing above
 377    * except for this stage the inner loop is only executed once so it is removed
 378    * entirely.
 379    */
 380   w_stride  = n_2 >> stage;
 381   w_2stride = n   >> stage;
 382   w_3stride = w_stride +  w_2stride;
 383   w_4stride = w_2stride + w_2stride;
 384
 385   stride_2 = stride >> 1;
 386   stride_4 = stride >> 2;
 387
 388   stride_3_4 = stride_2 + stride_4;
 389
 390   re0 = re;              im0 = im;
 391   re1 = re + stride_2;   im1 = im + stride_2;
 392   re2 = re + stride_4;   im2 = im + stride_4;
 393   re3 = re + stride_3_4; im3 = im + stride_3_4;
 394
 395   for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
 396     /* Compute the twiddle factors
 397      */
 398     w0 = W[offset];
 399     w1 = W[offset + w_stride];
 400     w2 = W[offset + w_2stride];
 401     w3 = W[offset + w_3stride];
 402
 403     tmp0 = spu_shuffle(w0, w2, shuf_0415);
 404     tmp1 = spu_shuffle(w1, w3, shuf_0415);
 405
 406     w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
 407     w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
 408
 409     j = i;
 410     k = i + stride;
 411
 412     re_lo0 = re0[j]; im_lo0 = im0[j];
 413     re_lo1 = re1[j]; im_lo1 = im1[j];
 414
 415     re_hi0 = re2[j]; im_hi0 = im2[j];
 416     re_hi1 = re3[j]; im_hi1 = im3[j];
 417
 418     re_lo2 = re0[k]; im_lo2 = im0[k];
 419     re_lo3 = re1[k]; im_lo3 = im1[k];
 420
 421     re_hi2 = re2[k]; im_hi2 = im2[k];
 422     re_hi3 = re3[k]; im_hi3 = im3[k];
 423
 424     FFT_1D_BUTTERFLY   (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
 425     FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
 426
 427     FFT_1D_BUTTERFLY   (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
 428     FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
 429   }
 430
 431
 432   /* Process the final stage (stage log2_size). For this stage,
 433    * reformat the data from parallel arrays back into
 434    * interleaved arrays,storing the result into <in>.
 435    *
 436    * This loop has been manually unrolled by 2 to improve
 437    * dual issue rates and reduce stalls. This unrolling
 438    * forces a minimum FFT size of 32.
 439    */
 440   re0 = re;
 441   re1 = re + n_8;
 442   re2 = re + n_16;
 443   re3 = re + n_3_16;
 444
 445   im0 = im;
 446   im1 = im + n_8;
 447   im2 = im + n_16;
 448   im3 = im + n_3_16;
 449
 450   out0 = out;
 451   out1 = out + n_4;
 452   out2 = out + n_8;
 453   out3 = out1 + n_8;
 454
 455   i = n_16;
 456
 457   do {
 458     /* Fetch the twiddle factors
 459      */
 460     w0 = W[0];
 461     w1 = W[1];
 462     w2 = W[2];
 463     w3 = W[3];
 464
 465     W += 4;
 466
 467     w0_re = spu_shuffle(w0, w1, shuf_0246);
 468     w0_im = spu_shuffle(w0, w1, shuf_1357);
 469     w1_re = spu_shuffle(w2, w3, shuf_0246);
 470     w1_im = spu_shuffle(w2, w3, shuf_1357);
 471
 472     /* Fetch the butterfly inputs, reals and imaginaries
 473      */
 474     re_lo0 = re0[0]; im_lo0 = im0[0];
 475     re_lo1 = re1[0]; im_lo1 = im1[0];
 476     re_lo2 = re0[1]; im_lo2 = im0[1];
 477     re_lo3 = re1[1]; im_lo3 = im1[1];
 478
 479     re_hi0 = re2[0]; im_hi0 = im2[0];
 480     re_hi1 = re3[0]; im_hi1 = im3[0];
 481     re_hi2 = re2[1]; im_hi2 = im2[1];
 482     re_hi3 = re3[1]; im_hi3 = im3[1];
 483
 484     re0 += 2; im0 += 2;
 485     re1 += 2; im1 += 2;
 486     re2 += 2; im2 += 2;
 487     re3 += 2; im3 += 2;
 488
 489     /* Perform the butterflys
 490      */
 491     FFT_1D_BUTTERFLY   (out_re_lo0, out_im_lo0, out_re_lo1, out_im_lo1, re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
 492     FFT_1D_BUTTERFLY   (out_re_lo2, out_im_lo2, out_re_lo3, out_im_lo3, re_lo2, im_lo2, re_lo3, im_lo3, w1_re, w1_im);
 493
 494     FFT_1D_BUTTERFLY_HI(out_re_hi0, out_im_hi0, out_re_hi1, out_im_hi1, re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
 495     FFT_1D_BUTTERFLY_HI(out_re_hi2, out_im_hi2, out_re_hi3, out_im_hi3, re_hi2, im_hi2, re_hi3, im_hi3, w1_re, w1_im);
 496
 497     /* Interleave the results and store them into the output buffers (ie,
 498      * the original input buffers.
 499      */
 500     out0[0] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_0415);
 501     out0[1] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_2637);
 502     out0[2] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_0415);
 503     out0[3] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_2637);
 504
 505     out1[0] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_0415);
 506     out1[1] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_2637);
 507     out1[2] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_0415);
 508     out1[3] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_2637);
 509
 510     out2[0] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_0415);
 511     out2[1] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_2637);
 512     out2[2] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_0415);
 513     out2[3] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_2637);
 514
 515     out3[0] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_0415);
 516     out3[1] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_2637);
 517     out3[2] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_0415);
 518     out3[3] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_2637);
 519
 520     out0 += 4;
 521     out1 += 4;
 522     out2 += 4;
 523     out3 += 4;
 524
 525     i -= 2;
 526   } while (i);
 527 }
 528
 529 #endif /* _FFT_1D_R2_H_ */