Merge branch 'upstream' into dfsg-orig

[debian/gnuradio] / gnuradio-core / src / lib / filter / gri_fft_filter_fff_sse.cc
diff --git a/gnuradio-core/src/lib/filter/gri_fft_filter_fff_sse.cc b/gnuradio-core/src/lib/filter/gri_fft_filter_fff_sse.cc

new file mode 100644 (file)

index 0000000..2680e65
--- /dev/null
+++ b/gnuradio-core/src/lib/filter/gri_fft_filter_fff_sse.cc
@@ -0,0 +1,184 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2010 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <gri_fft_filter_fff_sse.h>
+#include <gri_fft.h>
+#include <assert.h>
+#include <stdexcept>
+#include <cstdio>
+#include <xmmintrin.h>
+#include <fftw3.h>
+
+/*
+ * Construct the filter with a decimation factor and an initial set of taps.
+ *
+ * decimation - stored in d_decimation
+ * taps       - handed to set_taps(), which sizes the FFTs and transforms
+ *              the taps
+ */
+gri_fft_filter_fff_sse::gri_fft_filter_fff_sse (int decimation,
+                                                const std::vector<float> &taps)
+  : d_fftsize(-1), d_decimation(decimation), d_fwdfft(0), d_invfft(0)
+{
+  // One-element placeholder: compute_sizes() releases the current buffer
+  // with fftwf_free() before allocating the real one, so there must always
+  // be a valid allocation to hand back.
+  d_xformed_taps = static_cast<gr_complex*>(fftwf_malloc(sizeof(gr_complex)));
+
+  // d_fftsize starts at -1, so this first call always builds the FFT plans.
+  set_taps(taps);
+}
+
+/*
+ * Release both FFT objects and the transformed-tap buffer.
+ */
+gri_fft_filter_fff_sse::~gri_fft_filter_fff_sse ()
+{
+  delete d_invfft;
+  delete d_fwdfft;
+
+  // d_xformed_taps came from fftwf_malloc(), so it must go back through
+  // fftwf_free() rather than delete/free.
+  fftwf_free(d_xformed_taps);
+}
+
+/*
+ * Install a new set of filter taps.
+ *
+ * Recomputes d_ntaps, d_nsamples and d_fftsize (via compute_sizes()),
+ * clears the overlap tail, and stores the forward transform of the scaled,
+ * zero-padded taps in d_xformed_taps.
+ *
+ * Returns d_nsamples, the number of input samples consumed per FFT block.
+ */
+int
+gri_fft_filter_fff_sse::set_taps (const std::vector<float> &taps)
+{
+  compute_sizes(taps.size());
+
+  // Any tail saved from a previous tap set no longer applies; start clean.
+  d_tail.resize(tailsize());
+  for (int k = 0; k < tailsize(); k++)
+    d_tail[k] = 0;
+
+  float *time_buf = d_fwdfft->get_inbuf();
+  gr_complex *freq_buf = d_fwdfft->get_outbuf();
+
+  // Fold a 1/fftsize factor into the taps once, here.
+  // NOTE(review): presumably this compensates for an unnormalized
+  // forward/inverse FFT pair -- confirm against gri_fft.
+  float scale = 1.0 / d_fftsize;
+
+  // Scaled taps go in the first d_ntaps slots ...
+  int n = 0;
+  while (n < d_ntaps) {
+    time_buf[n] = taps[n] * scale;
+    n++;
+  }
+
+  // ... and the remainder of the FFT input is zero padding.
+  while (n < d_fftsize) {
+    time_buf[n] = 0;
+    n++;
+  }
+
+  d_fwdfft->execute();
+
+  // Keep the d_fftsize/2 + 1 output bins of the real-input transform.
+  const int nbins = d_fftsize/2 + 1;
+  for (int k = 0; k < nbins; k++)
+    d_xformed_taps[k] = freq_buf[k];
+
+  return d_nsamples;
+}
+
+/*
+ * Determine and set d_ntaps, d_nsamples and d_fftsize for ntaps taps, and
+ * rebuild the FFT objects and the transformed-tap buffer when the FFT size
+ * changes.
+ *
+ * The FFT size is twice the smallest power of two >= ntaps; d_nsamples is
+ * chosen so that d_fftsize == d_ntaps + d_nsamples - 1.
+ *
+ * Replacement resources are fully allocated before the old ones are
+ * released and before any member is modified.  If an allocation throws,
+ * the object keeps its previous sizes, plans and buffer, so the pointers
+ * never dangle and the destructor cannot double-free.
+ *
+ * Throws std::runtime_error if the tap buffer cannot be allocated;
+ * exceptions from constructing the FFT objects propagate unchanged.
+ */
+void
+gri_fft_filter_fff_sse::compute_sizes(int ntaps)
+{
+  const int new_fftsize = (int) (2 * pow(2.0, ceil(log(ntaps) / log(2))));
+  const int new_nsamples = new_fftsize - ntaps + 1;
+
+  if (0)
+    fprintf(stderr, "gri_fft_filter_fff_sse: ntaps = %d, fftsize = %d, nsamples = %d\n",
+            ntaps, new_fftsize, new_nsamples);
+
+  assert(new_fftsize == ntaps + new_nsamples - 1);
+
+  if (new_fftsize != d_fftsize){       // compute new plans
+    gri_fft_real_fwd *new_fwd = 0;
+    gri_fft_real_rev *new_inv = 0;
+    gr_complex *new_xformed_taps = 0;
+
+    try {
+      new_fwd = new gri_fft_real_fwd(new_fftsize);
+      new_inv = new gri_fft_real_rev(new_fftsize);
+
+      // A real-input FFT of size N yields N/2 + 1 complex bins.
+      new_xformed_taps =
+        (gr_complex*)fftwf_malloc((new_fftsize/2+1)*sizeof(gr_complex));
+      if (new_xformed_taps == 0)
+        throw std::runtime_error("gri_fft_filter_fff_sse: fftwf_malloc failed");
+    }
+    catch (...) {
+      // new_xformed_taps is necessarily null here; only the FFT objects
+      // may need releasing.
+      delete new_inv;
+      delete new_fwd;
+      throw;
+    }
+
+    // Everything succeeded: swap in the new resources.
+    delete d_fwdfft;
+    delete d_invfft;
+    fftwf_free(d_xformed_taps);
+
+    d_fwdfft = new_fwd;
+    d_invfft = new_inv;
+    d_xformed_taps = new_xformed_taps;
+  }
+
+  d_ntaps = ntaps;
+  d_fftsize = new_fftsize;
+  d_nsamples = new_nsamples;
+}
+
+/*
+ * Filter a block of samples with the overlap-add method, multiplying the
+ * spectra with SSE, and decimate the result by d_decimation.
+ *
+ * nitems - number of output samples to produce
+ * input  - source samples; nitems * d_decimation of them are consumed
+ * output - destination for the filtered, decimated samples
+ *
+ * Returns nitems.
+ *
+ * NOTE(review): input is read in whole chunks of d_nsamples, so
+ * nitems * d_decimation is presumably expected to be a multiple of
+ * d_nsamples (the value set_taps() returns); otherwise the final chunk
+ * reads beyond the consumed range and the closing assert on dec_ctr may
+ * fire.  Confirm against callers.
+ */
+int
+gri_fft_filter_fff_sse::filter (int nitems, const float *input, float *output)
+{
+  int dec_ctr = 0;
+  int j = 0;
+  int ninput_items = nitems * d_decimation;
+
+  for (int i = 0; i < ninput_items; i += d_nsamples){
+
+    // Load d_nsamples fresh samples and zero-pad up to the FFT size.
+    memcpy(d_fwdfft->get_inbuf(), &input[i], d_nsamples * sizeof(float));
+
+    for (j = d_nsamples; j < d_fftsize; j++)
+      d_fwdfft->get_inbuf()[j] = 0;
+
+    d_fwdfft->execute();       // compute fwd xform
+
+    // View the complex spectra as interleaved (re, im) floats.  From here
+    // on, j is a FLOAT index: complex bin k lives at floats 2k and 2k+1.
+    float *a = (float*)(d_fwdfft->get_outbuf());
+    float *b = (float*)(&d_xformed_taps[0]);
+    float *c = (float*)(d_invfft->get_inbuf());
+
+    // _mm_load_ps/_mm_store_ps are aligned accesses.
+    // NOTE(review): assumes the gri_fft buffers are 16-byte aligned like
+    // the fftwf_malloc'ed d_xformed_taps -- confirm in gri_fft.
+    __m128 x0, x1, x2, t0, t1, m;
+    m = _mm_set_ps(-1, 1, -1, 1);      // lanes 0..3 = {+1, -1, +1, -1}
+    for (j = 0; j < d_fftsize; j+=4) { // filter in the freq domain
+      x0 = _mm_load_ps(&a[j]);         // {ar0, ai0, ar1, ai1}
+      t0 = _mm_load_ps(&b[j]);         // {br0, bi0, br1, bi1}
+
+      t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 1, 1)); // {bi0, bi0, bi1, bi1}
+      t0 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 0, 0)); // {br0, br0, br1, br1}
+      t1 = _mm_mul_ps(t1, m);                               // {bi0,-bi0, bi1,-bi1}
+
+      x1 = _mm_mul_ps(x0, t0);         // {ar*br, ai*br, ...}
+      x2 = _mm_mul_ps(x0, t1);         // {ar*bi, -ai*bi, ...}
+
+      // Swap within each pair so the cross terms line up, then add:
+      // result = {ar*br - ai*bi, ai*br + ar*bi, ...}
+      x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));
+      x2 = _mm_add_ps(x1, x2);
+
+      _mm_store_ps(&c[j], x2);
+    }
+
+    // Finish off the last (Nyquist) bin; do the complex multiply as floats.
+    // The vector loop covers at least float indices [0, d_fftsize), i.e.
+    // complex bins 0 .. d_fftsize/2 - 1.  The remaining bin d_fftsize/2
+    // starts at float index d_fftsize (not d_fftsize/2), because a, b and
+    // c are float pointers over interleaved complex data.
+    j = d_fftsize;
+    c[j] = (a[j] * b[j]) - (a[j+1] * b[j+1]);
+    c[j+1] = (a[j] * b[j+1]) + (a[j+1] * b[j]);
+
+    d_invfft->execute();       // compute inv xform
+
+    // add in the overlapping tail
+
+    for (j = 0; j < tailsize(); j++)
+      d_invfft->get_outbuf()[j] += d_tail[j];
+
+    // copy every d_decimation-th sample of the first d_nsamples results to
+    // the output; dec_ctr carries the decimation phase across chunks
+
+    j = dec_ctr;
+    while (j < d_nsamples) {
+      *output++ = d_invfft->get_outbuf()[j];
+      j += d_decimation;
+    }
+    dec_ctr = (j - d_nsamples);
+
+    // stash the tail
+    memcpy(&d_tail[0], d_invfft->get_outbuf() + d_nsamples,
+           tailsize() * sizeof(float));
+  }
+
+  assert(dec_ctr == 0);
+
+  return nitems;
+}