git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_fir_fast_q31.c
   9 *
  10 * Description:  Processing function for the Q31 Fast FIR filter.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 *
  29 * Version 0.0.9  2010/08/27
  30 *    Initial version
  31 * -------------------------------------------------------------------- */
  32
  33 #include "arm_math.h"
  34
  35 /**
  36  * @ingroup groupFilters
  37  */
  38
  39 /**
  40  * @addtogroup FIR
  41  * @{
  42  */
  43
  44 /**
  45  * @param[in] *S points to an instance of the Q31 structure.
  46  * @param[in] *pSrc points to the block of input data.
  47  * @param[out] *pDst points to the block output data.
  48  * @param[in] blockSize number of samples to process per call.
  49  * @return none.
  50  *
  51  * <b>Scaling and Overflow Behavior:</b>
  52  *
  53  * \par
  54  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  55  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  56  * These intermediate results are added to a 2.30 accumulator.
  57  * Finally, the accumulator is saturated and converted to a 1.31 result.
  58  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
  59  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
  60  *
  61  * \par
  62  * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  Both the slow and the fast versions use the same instance structure.
  63  * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure.
  64  */
  65
  66 void arm_fir_fast_q31(
  67   const arm_fir_instance_q31 * S,
  68   q31_t * pSrc,
  69   q31_t * pDst,
  70   uint32_t blockSize)
  71 {
  72   q31_t *pState = S->pState;                     /* State pointer */
  73   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  74   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
  75   q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */
  76   q31_t c0;                                      /* Temporary variable to hold coefficient value */
  77   q31_t *px;                                     /* Temporary pointer for state */
  78   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
  79   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  80   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  81   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  82
  83   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
  84   /* pStateCurnt points to the location where the new input data should be written */
  85   pStateCurnt = &(S->pState[(numTaps - 1u)]);
  86
  87   /* Apply loop unrolling and compute 4 output values simultaneously.
  88    * The variables acc0 ... acc3 hold output values that are being computed:
  89    *
  90    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
  91    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
  92    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
  93    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
  94    */
  95   blkCnt = blockSize >> 2;
  96
  97   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
  98    ** a second loop below computes the remaining 1 to 3 samples. */
  99   while(blkCnt > 0u)
 100   {
 101     /* Copy four new input samples into the state buffer */
 102     *pStateCurnt++ = *pSrc++;
 103     *pStateCurnt++ = *pSrc++;
 104     *pStateCurnt++ = *pSrc++;
 105     *pStateCurnt++ = *pSrc++;
 106
 107     /* Set all accumulators to zero */
 108     acc0 = 0;
 109     acc1 = 0;
 110     acc2 = 0;
 111     acc3 = 0;
 112
 113     /* Initialize state pointer */
 114     px = pState;
 115
 116     /* Initialize coefficient pointer */
 117     pb = pCoeffs;
 118
 119     /* Read the first three samples from the state buffer:
 120      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
 121     x0 = *(px++);
 122     x1 = *(px++);
 123     x2 = *(px++);
 124
 125     /* Loop unrolling.  Process 4 taps at a time. */
 126     tapCnt = numTaps >> 2;
 127     i = tapCnt;
 128
 129     while(i > 0u)
 130     {
 131       /* Read the b[numTaps] coefficient */
 132       c0 = *(pb++);
 133
 134       /* Read x[n-numTaps-3] sample */
 135       x3 = *(px++);
 136
 137       /* acc0 +=  b[numTaps] * x[n-numTaps] */
 138       acc0 = (q31_t) ((((q63_t) x0 * c0) + (acc0 << 32)) >> 32);
 139
 140       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
 141       acc1 = (q31_t) ((((q63_t) x1 * c0) + (acc1 << 32)) >> 32);
 142
 143       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
 144       acc2 = (q31_t) ((((q63_t) x2 * c0) + (acc2 << 32)) >> 32);
 145
 146       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
 147       acc3 = (q31_t) ((((q63_t) x3 * c0) + (acc3 << 32)) >> 32);
 148
 149       /* Read the b[numTaps-1] coefficient */
 150       c0 = *(pb++);
 151
 152       /* Read x[n-numTaps-4] sample */
 153       x0 = *(px++);
 154
 155       /* Perform the multiply-accumulates */
 156       acc0 = (q31_t) ((((q63_t) x1 * c0) + (acc0 << 32)) >> 32);
 157       acc1 = (q31_t) ((((q63_t) x2 * c0) + (acc1 << 32)) >> 32);
 158       acc2 = (q31_t) ((((q63_t) x3 * c0) + (acc2 << 32)) >> 32);
 159       acc3 = (q31_t) ((((q63_t) x0 * c0) + (acc3 << 32)) >> 32);
 160
 161       /* Read the b[numTaps-2] coefficient */
 162       c0 = *(pb++);
 163
 164       /* Read x[n-numTaps-5] sample */
 165       x1 = *(px++);
 166
 167       /* Perform the multiply-accumulates */
 168       acc0 = (q31_t) ((((q63_t) x2 * c0) + (acc0 << 32)) >> 32);
 169       acc1 = (q31_t) ((((q63_t) x3 * c0) + (acc1 << 32)) >> 32);
 170       acc2 = (q31_t) ((((q63_t) x0 * c0) + (acc2 << 32)) >> 32);
 171       acc3 = (q31_t) ((((q63_t) x1 * c0) + (acc3 << 32)) >> 32);
 172
 173       /* Read the b[numTaps-3] coefficients */
 174       c0 = *(pb++);
 175
 176       /* Read x[n-numTaps-6] sample */
 177       x2 = *(px++);
 178
 179       /* Perform the multiply-accumulates */
 180       acc0 = (q31_t) ((((q63_t) x3 * c0) + (acc0 << 32)) >> 32);
 181       acc1 = (q31_t) ((((q63_t) x0 * c0) + (acc1 << 32)) >> 32);
 182       acc2 = (q31_t) ((((q63_t) x1 * c0) + (acc2 << 32)) >> 32);
 183       acc3 = (q31_t) ((((q63_t) x2 * c0) + (acc3 << 32)) >> 32);
 184       i--;
 185     }
 186
 187     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 188
 189     i = numTaps - (tapCnt * 4u);
 190     while(i > 0u)
 191     {
 192       /* Read coefficients */
 193       c0 = *(pb++);
 194
 195       /* Fetch 1 state variable */
 196       x3 = *(px++);
 197
 198       /* Perform the multiply-accumulates */
 199       acc0 = (q31_t) ((((q63_t) x0 * c0) + (acc0 << 32)) >> 32);
 200       acc1 = (q31_t) ((((q63_t) x1 * c0) + (acc1 << 32)) >> 32);
 201       acc2 = (q31_t) ((((q63_t) x2 * c0) + (acc2 << 32)) >> 32);
 202       acc3 = (q31_t) ((((q63_t) x3 * c0) + (acc3 << 32)) >> 32);
 203
 204       /* Reuse the present sample states for next sample */
 205       x0 = x1;
 206       x1 = x2;
 207       x2 = x3;
 208
 209       /* Decrement the loop counter */
 210       i--;
 211     }
 212
 213     /* Advance the state pointer by 4 to process the next group of 4 samples */
 214     pState = pState + 4;
 215
 216     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31
 217      ** Then store the 4 outputs in the destination buffer. */
 218     *pDst++ = (q31_t) (acc0 << 1);
 219     *pDst++ = (q31_t) (acc1 << 1);
 220     *pDst++ = (q31_t) (acc2 << 1);
 221     *pDst++ = (q31_t) (acc3 << 1);
 222
 223     /* Decrement the samples loop counter */
 224     blkCnt--;
 225   }
 226
 227
 228   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 229    ** No loop unrolling is used. */
 230   blkCnt = blockSize % 4u;
 231
 232   while(blkCnt > 0u)
 233   {
 234     /* Copy one sample at a time into state buffer */
 235     *pStateCurnt++ = *pSrc++;
 236
 237     /* Set the accumulator to zero */
 238     acc0 = 0;
 239
 240     /* Initialize state pointer */
 241     px = pState;
 242
 243     /* Initialize Coefficient pointer */
 244     pb = (pCoeffs);
 245
 246     i = numTaps;
 247
 248     /* Perform the multiply-accumulates */
 249     do
 250     {
 251       acc0 = (q31_t) ((((q63_t) * (px++) * (*(pb++))) + (acc0 << 32)) >> 32);
 252       i--;
 253     } while(i > 0u);
 254
 255     /* The result is in 2.30 format.  Convert to 1.31
 256      ** Then store the output in the destination buffer. */
 257     *pDst++ = (q31_t) (acc0 << 1);
 258
 259     /* Advance state pointer by 1 for the next sample */
 260     pState = pState + 1;
 261
 262     /* Decrement the samples loop counter */
 263     blkCnt--;
 264   }
 265
 266   /* Processing is complete.
 267    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
 268    ** This prepares the state buffer for the next function call. */
 269
 270   /* Points to the start of the state buffer */
 271   pStateCurnt = S->pState;
 272
 273   tapCnt = (numTaps - 1u) >> 2u;
 274
 275   /* copy data */
 276   while(tapCnt > 0u)
 277   {
 278     *pStateCurnt++ = *pState++;
 279     *pStateCurnt++ = *pState++;
 280     *pStateCurnt++ = *pState++;
 281     *pStateCurnt++ = *pState++;
 282
 283     /* Decrement the loop counter */
 284     tapCnt--;
 285   }
 286
 287   /* Calculate remaining number of copies */
 288   tapCnt = (numTaps - 1u) % 0x4u;
 289
 290   /* Copy the remaining q31_t data */
 291   while(tapCnt > 0u)
 292   {
 293     *pStateCurnt++ = *pState++;
 294
 295     /* Decrement the loop counter */
 296     tapCnt--;
 297   }
 298
 299 }
 300
 301 /**
 302  * @} end of FIR group
 303  */