3 * Copyright 2010 Free Software Foundation, Inc.
5 * This file is part of GNU Radio
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
27 #include <gri_fft_filter_fff_sse.h>
32 #include <xmmintrin.h>
// Constructor. The initializer list records `decimation` in d_decimation,
// starts d_fftsize at -1 (a value compute_sizes() does not produce for
// ntaps >= 1, so its "size changed" branch runs the first time) and leaves
// both FFT plan pointers null.
// NOTE(review): `taps` is not used by any line visible in this listing, and the
// body's braces are absent, so lines appear to have been dropped; presumably
// the full body also installs the taps (e.g. via set_taps) -- confirm.
35 gri_fft_filter_fff_sse::gri_fft_filter_fff_sse (int decimation,
36 const std::vector<float> &taps)
37 : d_fftsize(-1), d_decimation(decimation), d_fwdfft(0), d_invfft(0)
// Allocate a one-element placeholder with fftwf_malloc so that the
// fftwf_free(d_xformed_taps) calls in compute_sizes() and in the destructor
// are always handed a pointer that came from fftwf_malloc.
39 d_xformed_taps = (gr_complex*)fftwf_malloc(1*sizeof(gr_complex));
// Destructor. Releases the transformed-taps buffer with fftwf_free(), the
// counterpart of the fftwf_malloc() used everywhere it is allocated.
// NOTE(review): d_fwdfft and d_invfft are created with `new` in
// compute_sizes(); no matching release is visible in this listing, which
// appears to omit lines -- confirm the full destructor deletes both plans.
43 gri_fft_filter_fff_sse::~gri_fft_filter_fff_sse ()
45 fftwf_free(d_xformed_taps);
51 * determines d_ntaps, d_nsamples, d_fftsize, d_xformed_taps
// Installs a new set of filter taps.
//
// Steps visible here: recompute the sizes for taps.size() (which may rebuild
// the FFT plans and the d_xformed_taps buffer), resize the saved tail to
// tailsize(), load the scaled taps into the forward FFT's input buffer, run
// the transform and keep the first d_fftsize/2+1 output bins in
// d_xformed_taps.
//
// taps: time-domain filter coefficients; taps.size() drives all sizing.
//
// NOTE(review): the return type, the declaration of `i`, the bodies of the
// two body-less `for` loops below and any return statement are not visible in
// this listing -- confirm against the full file.
54 gri_fft_filter_fff_sse::set_taps (const std::vector<float> &taps)
57 compute_sizes(taps.size());
59 d_tail.resize(tailsize());
// NOTE(review): loop body not visible; presumably clears d_tail -- confirm.
60 for (i = 0; i < tailsize(); i++)
// The buffers must be fetched after compute_sizes(), because that call may
// have replaced d_fwdfft with a new plan.
63 float *in = d_fwdfft->get_inbuf();
64 gr_complex *out = d_fwdfft->get_outbuf();
// Every tap is pre-multiplied by 1/d_fftsize.
// NOTE(review): presumably this folds the normalization of the unnormalized
// forward/inverse FFT pair into the taps so filter() need not rescale --
// confirm against the gri_fft / FFTW conventions.
66 float scale = 1.0 / d_fftsize;
68 // Compute forward xform of taps.
69 // Copy taps into first ntaps slots, then pad with zeros
70 for (i = 0; i < d_ntaps; i++)
71 in[i] = taps[i] * scale;
// Continues from i == d_ntaps up to d_fftsize; per the comment above this is
// the zero padding (the loop body itself is not visible in this listing).
73 for (; i < d_fftsize; i++)
76 d_fwdfft->execute(); // do the xform
78 // now copy output to d_xformed_taps
// Only d_fftsize/2+1 bins are copied, matching the size d_xformed_taps is
// allocated with in compute_sizes().
79 for (i = 0; i < d_fftsize/2+1; i++)
80 d_xformed_taps[i] = out[i];
85 // determine and set d_ntaps, d_nsamples, d_fftsize
// ntaps: number of filter taps.
//
// Also rebuilds both FFT plans and reallocates d_xformed_taps whenever the
// computed FFT size differs from the previous one.
//
// NOTE(review): the return type and the assignment of d_ntaps are not visible
// in this listing, although d_ntaps is read below -- confirm.
// NOTE(review): ntaps == 0 would evaluate log(0); callers presumably always
// pass ntaps >= 1 -- confirm.
88 gri_fft_filter_fff_sse::compute_sizes(int ntaps)
// Remember the previous size so an unchanged size can skip re-planning.
90 int old_fftsize = d_fftsize;
// log(ntaps)/log(2) is log2(ntaps); rounding it up and raising 2 to that
// power gives the smallest power of two >= ntaps, and the FFT size is twice
// that value.
92 d_fftsize = (int) (2 * pow(2.0, ceil(log(ntaps) / log(2))));
// Number of input samples handled per FFT block; chosen so that
// d_ntaps + d_nsamples - 1 == d_fftsize (checked by the assert below).
93 d_nsamples = d_fftsize - d_ntaps + 1;
// Diagnostic dump of the computed sizes.
// NOTE(review): whether this print is guarded by a condition cannot be seen
// in this listing -- confirm.
96 fprintf(stderr, "gri_fft_filter_fff_sse: ntaps = %d, fftsize = %d, nsamples = %d\n",
97 d_ntaps, d_fftsize, d_nsamples);
99 assert(d_fftsize == d_ntaps + d_nsamples -1 );
101 if (d_fftsize != old_fftsize){ // compute new plans
// NOTE(review): no release of the previous plans is visible before these
// allocations; the listing appears to omit lines -- confirm the old
// d_fwdfft / d_invfft are deleted first.
104 d_fwdfft = new gri_fft_real_fwd(d_fftsize);
105 d_invfft = new gri_fft_real_rev(d_fftsize);
106 //d_xformed_taps.resize(d_fftsize/2+1);
// Replace the taps spectrum buffer with one holding d_fftsize/2+1 complex
// bins, the count set_taps() copies from the forward transform's output.
108 fftwf_free(d_xformed_taps);
109 d_xformed_taps = (gr_complex*)fftwf_malloc((d_fftsize/2+1)*sizeof(gr_complex));
// Filters a run of samples block by block in the frequency domain.
//
// For each chunk of d_nsamples inputs: copy the chunk into the forward FFT's
// input buffer, zero the rest up to d_fftsize, transform, multiply the
// spectrum by d_xformed_taps, inverse transform, add the tail saved from the
// previous chunk, write samples to `output`, and save the new tail.
//
// nitems: item count; nitems * d_decimation input samples are consumed.
// input:  source samples, read d_nsamples at a time.
// output: destination, advanced once per sample written.
//
// NOTE(review): the return type, the declarations of `j` and `dec_ctr`, the
// closing braces and the return statement are not visible in this listing --
// confirm against the full file.
114 gri_fft_filter_fff_sse::filter (int nitems, const float *input, float *output)
118 int ninput_items = nitems * d_decimation;
120 for (int i = 0; i < ninput_items; i += d_nsamples){
122 memcpy(d_fwdfft->get_inbuf(), &input[i], d_nsamples * sizeof(float));
// Zero-pad the remainder of the FFT input after the d_nsamples fresh samples.
124 for (j = d_nsamples; j < d_fftsize; j++)
125 d_fwdfft->get_inbuf()[j] = 0;
127 d_fwdfft->execute(); // compute fwd xform
// View the three complex buffers as flat float arrays; the scalar multiply
// further down uses index j as the real part and j+1 as the imaginary part.
129 float *a = (float*)(d_fwdfft->get_outbuf());
130 float *b = (float*)(&d_xformed_taps[0]);
131 float *c = (float*)(d_invfft->get_inbuf());
133 __m128 x0, x1, x2, t0, t1, m;
// _mm_set_ps lists elements high-to-low, so in memory order m is
// (+1, -1, +1, -1).
134 m = _mm_set_ps(-1, 1, -1, 1);
// Each pass handles 4 floats, i.e. two complex bins. _mm_load_ps and
// _mm_store_ps need 16-byte-aligned addresses.
// NOTE(review): assumes the gri_fft buffers are 16-byte aligned -- confirm.
135 for (j = 0; j < d_fftsize; j+=4) { // filter in the freq domain
136 x0 = _mm_load_ps(&a[j]);
137 t0 = _mm_load_ps(&b[j]);
// t1 = (b.im0, b.im0, b.im1, b.im1); t0 = (b.re0, b.re0, b.re1, b.re1);
// multiplying t1 by m negates every second lane: (b.im, -b.im, ...).
139 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 1, 1));
140 t0 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 0, 0));
141 t1 = _mm_mul_ps(t1, m);
// x1 = (a.re*b.re, a.im*b.re, ...); x2 = (a.re*b.im, -a.im*b.im, ...).
143 x1 = _mm_mul_ps(x0, t0);
144 x2 = _mm_mul_ps(x0, t1);
// Swap neighbouring lanes of x2 so the sum is
// (a.re*b.re - a.im*b.im, a.im*b.re + a.re*b.im): the complex product a*b.
146 x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));
147 x2 = _mm_add_ps(x1, x2);
149 _mm_store_ps(&c[j], x2);
152 // Finish off the last one; do the complex multiply as floats
// The vector loop covers floats [0, d_fftsize), i.e. d_fftsize/2 bins; the
// spectrum holds d_fftsize/2+1 bins, so one bin remains for scalar code.
// NOTE(review): these lines rely on j addressing that last bin
// (j == d_fftsize); no assignment to j is visible between the loop and here
// in this listing -- confirm.
154 c[j] = (a[j] * b[j]) - (a[j+1] * b[j+1]);
155 c[j+1] = (a[j] * b[j+1]) + (a[j+1] * b[j]);
157 d_invfft->execute(); // compute inv xform
159 // add in the overlapping tail
161 for (j = 0; j < tailsize(); j++)
162 d_invfft->get_outbuf()[j] += d_tail[j];
164 // copy nsamples to output
166 //memcpy(out, d_invfft->get_outbuf(), d_nsamples * sizeof(float));
// Write samples from the first d_nsamples results to the output.
// NOTE(review): the initialisation of j before this loop and its increment
// inside it are not visible in this listing; presumably j starts at dec_ctr
// and advances by d_decimation -- confirm.
170 while (j < d_nsamples) {
171 *output++ = d_invfft->get_outbuf()[j];
// How far j ran past the end of this chunk.
174 dec_ctr = (j - d_nsamples);
// Save the tailsize() results beyond the first d_nsamples; they are added
// into the start of the next chunk's output by the loop above.
177 memcpy(&d_tail[0], d_invfft->get_outbuf() + d_nsamples,
178 tailsize() * sizeof(float));
// After the last chunk no decimation offset may remain.
181 assert(dec_ctr == 0);