git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_fir_q31.c
   9 *
  10 * Description:  Q31 FIR filter processing function.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 *
  29 * Version 0.0.5  2010/04/26
  30 *        incorporated review comments and updated with latest CMSIS layer
  31 *
  32 * Version 0.0.3  2010/03/10
  33 *    Initial version
  34 * -------------------------------------------------------------------- */
  35
  36 #include "arm_math.h"
  37
  38 /**
  39  * @ingroup groupFilters
  40  */
  41
  42 /**
  43  * @addtogroup FIR
  44  * @{
  45  */
  46
  47 /**
  48  * @param[in] *S points to an instance of the Q31 FIR filter structure.
  49  * @param[in] *pSrc points to the block of input data.
  50  * @param[out] *pDst points to the block of output data.
  51  * @param[in] blockSize number of samples to process per call.
  52  * @return none.
  53  *
  54  * @details
  55  * <b>Scaling and Overflow Behavior:</b>
  56  * \par
  57  * The function is implemented using an internal 64-bit accumulator.
  58  * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  59  * Thus, if the accumulator result overflows it wraps around rather than clip.
  60  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
  61  * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  62  *
  63  * \par
  64  * Refer to the function <code>arm_fir_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
  65  */
  66
  67 void arm_fir_q31(
  68   const arm_fir_instance_q31 * S,
  69   q31_t * pSrc,
  70   q31_t * pDst,
  71   uint32_t blockSize)
  72 {
  73   q31_t *pState = S->pState;                     /* State pointer */
  74   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  75   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
  76
  77
  78 #ifndef ARM_MATH_CM0
  79
  80   /* Run the below code for Cortex-M4 and Cortex-M3 */
  81
  82   q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */
  83   q31_t c0;                                      /* Temporary variable to hold coefficient value */
  84   q31_t *px;                                     /* Temporary pointer for state */
  85   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
  86   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  87   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  88   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  89
  90   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  91   /* pStateCurnt points to the location where the new input data should be written */
  92   pStateCurnt = &(S->pState[(numTaps - 1u)]);
  93
  94   /* Apply loop unrolling and compute 4 output values simultaneously.
  95    * The variables acc0 ... acc3 hold output values that are being computed:
  96    *
  97    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
  98    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
  99    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
 100    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
 101    */
 102   blkCnt = blockSize >> 2;
 103
 104   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 105    ** a second loop below computes the remaining 1 to 3 samples. */
 106   while(blkCnt > 0u)
 107   {
 108     /* Copy four new input samples into the state buffer */
 109     *pStateCurnt++ = *pSrc++;
 110     *pStateCurnt++ = *pSrc++;
 111     *pStateCurnt++ = *pSrc++;
 112     *pStateCurnt++ = *pSrc++;
 113
 114     /* Set all accumulators to zero */
 115     acc0 = 0;
 116     acc1 = 0;
 117     acc2 = 0;
 118     acc3 = 0;
 119
 120     /* Initialize state pointer */
 121     px = pState;
 122
 123     /* Initialize coefficient pointer */
 124     pb = pCoeffs;
 125
 126     /* Read the first three samples from the state buffer:
 127      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
 128     x0 = *(px++);
 129     x1 = *(px++);
 130     x2 = *(px++);
 131
 132     /* Loop unrolling.  Process 4 taps at a time. */
 133     tapCnt = numTaps >> 2;
 134     i = tapCnt;
 135
 136     while(i > 0u)
 137     {
 138       /* Read the b[numTaps] coefficient */
 139       c0 = *(pb++);
 140
 141       /* Read x[n-numTaps-3] sample */
 142       x3 = *(px++);
 143
 144       /* acc0 +=  b[numTaps] * x[n-numTaps] */
 145       acc0 += ((q63_t) x0 * c0);
 146
 147       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
 148       acc1 += ((q63_t) x1 * c0);
 149
 150       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
 151       acc2 += ((q63_t) x2 * c0);
 152
 153       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
 154       acc3 += ((q63_t) x3 * c0);
 155
 156       /* Read the b[numTaps-1] coefficient */
 157       c0 = *(pb++);
 158
 159       /* Read x[n-numTaps-4] sample */
 160       x0 = *(px++);
 161
 162       /* Perform the multiply-accumulates */
 163       acc0 += ((q63_t) x1 * c0);
 164       acc1 += ((q63_t) x2 * c0);
 165       acc2 += ((q63_t) x3 * c0);
 166       acc3 += ((q63_t) x0 * c0);
 167
 168       /* Read the b[numTaps-2] coefficient */
 169       c0 = *(pb++);
 170
 171       /* Read x[n-numTaps-5] sample */
 172       x1 = *(px++);
 173
 174       /* Perform the multiply-accumulates */
 175       acc0 += ((q63_t) x2 * c0);
 176       acc1 += ((q63_t) x3 * c0);
 177       acc2 += ((q63_t) x0 * c0);
 178       acc3 += ((q63_t) x1 * c0);
 179       /* Read the b[numTaps-3] coefficients */
 180       c0 = *(pb++);
 181
 182       /* Read x[n-numTaps-6] sample */
 183       x2 = *(px++);
 184
 185       /* Perform the multiply-accumulates */
 186       acc0 += ((q63_t) x3 * c0);
 187       acc1 += ((q63_t) x0 * c0);
 188       acc2 += ((q63_t) x1 * c0);
 189       acc3 += ((q63_t) x2 * c0);
 190       i--;
 191     }
 192
 193     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 194
 195     i = numTaps - (tapCnt * 4u);
 196     while(i > 0u)
 197     {
 198       /* Read coefficients */
 199       c0 = *(pb++);
 200
 201       /* Fetch 1 state variable */
 202       x3 = *(px++);
 203
 204       /* Perform the multiply-accumulates */
 205       acc0 += ((q63_t) x0 * c0);
 206       acc1 += ((q63_t) x1 * c0);
 207       acc2 += ((q63_t) x2 * c0);
 208       acc3 += ((q63_t) x3 * c0);
 209
 210       /* Reuse the present sample states for next sample */
 211       x0 = x1;
 212       x1 = x2;
 213       x2 = x3;
 214
 215       /* Decrement the loop counter */
 216       i--;
 217     }
 218
 219     /* Advance the state pointer by 4 to process the next group of 4 samples */
 220     pState = pState + 4;
 221
 222     /* The results in the 4 accumulators are in 2.62 format.  Convert to 1.31
 223      ** Then store the 4 outputs in the destination buffer. */
 224     *pDst++ = (q31_t) (acc0 >> 31u);
 225     *pDst++ = (q31_t) (acc1 >> 31u);
 226     *pDst++ = (q31_t) (acc2 >> 31u);
 227     *pDst++ = (q31_t) (acc3 >> 31u);
 228
 229     /* Decrement the samples loop counter */
 230     blkCnt--;
 231   }
 232
 233
 234   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 235    ** No loop unrolling is used. */
 236   blkCnt = blockSize % 4u;
 237
 238   while(blkCnt > 0u)
 239   {
 240     /* Copy one sample at a time into state buffer */
 241     *pStateCurnt++ = *pSrc++;
 242
 243     /* Set the accumulator to zero */
 244     acc0 = 0;
 245
 246     /* Initialize state pointer */
 247     px = pState;
 248
 249     /* Initialize Coefficient pointer */
 250     pb = (pCoeffs);
 251
 252     i = numTaps;
 253
 254     /* Perform the multiply-accumulates */
 255     do
 256     {
 257       acc0 += (q63_t) * (px++) * (*(pb++));
 258       i--;
 259     } while(i > 0u);
 260
 261     /* The result is in 2.62 format.  Convert to 1.31
 262      ** Then store the output in the destination buffer. */
 263     *pDst++ = (q31_t) (acc0 >> 31u);
 264
 265     /* Advance state pointer by 1 for the next sample */
 266     pState = pState + 1;
 267
 268     /* Decrement the samples loop counter */
 269     blkCnt--;
 270   }
 271
 272   /* Processing is complete.
 273    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
 274    ** This prepares the state buffer for the next function call. */
 275
 276   /* Points to the start of the state buffer */
 277   pStateCurnt = S->pState;
 278
 279   tapCnt = (numTaps - 1u) >> 2u;
 280
 281   /* copy data */
 282   while(tapCnt > 0u)
 283   {
 284     *pStateCurnt++ = *pState++;
 285     *pStateCurnt++ = *pState++;
 286     *pStateCurnt++ = *pState++;
 287     *pStateCurnt++ = *pState++;
 288
 289     /* Decrement the loop counter */
 290     tapCnt--;
 291   }
 292
 293   /* Calculate remaining number of copies */
 294   tapCnt = (numTaps - 1u) % 0x4u;
 295
 296   /* Copy the remaining q31_t data */
 297   while(tapCnt > 0u)
 298   {
 299     *pStateCurnt++ = *pState++;
 300
 301     /* Decrement the loop counter */
 302     tapCnt--;
 303   }
 304
 305 #else
 306
 307 /* Run the below code for Cortex-M0 */
 308
 309   q31_t *px;                                     /* Temporary pointer for state */
 310   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
 311   q63_t acc;                                     /* Accumulator */
 312   uint32_t numTaps = S->numTaps;                 /* Length of the filter */
 313   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
 314
 315   /* S->pState buffer contains previous frame (numTaps - 1) samples */
 316   /* pStateCurnt points to the location where the new input data should be written */
 317   pStateCurnt = &(S->pState[(numTaps - 1u)]);
 318
 319   /* Initialize blkCnt with blockSize */
 320   blkCnt = blockSize;
 321
 322   while(blkCnt > 0u)
 323   {
 324     /* Copy one sample at a time into state buffer */
 325     *pStateCurnt++ = *pSrc++;
 326
 327     /* Set the accumulator to zero */
 328     acc = 0;
 329
 330     /* Initialize state pointer */
 331     px = pState;
 332
 333     /* Initialize Coefficient pointer */
 334     pb = pCoeffs;
 335
 336     i = numTaps;
 337
 338     /* Perform the multiply-accumulates */
 339     do
 340     {
 341       /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
 342       acc += (q63_t) * px++ * *pb++;
 343       i--;
 344     } while(i > 0u);
 345
 346     /* The result is in 2.62 format.  Convert to 1.31
 347      ** Then store the output in the destination buffer. */
 348     *pDst++ = (q31_t) (acc >> 31u);
 349
 350     /* Advance state pointer by 1 for the next sample */
 351     pState = pState + 1;
 352
 353     /* Decrement the samples loop counter */
 354     blkCnt--;
 355   }
 356
 357   /* Processing is complete.
 358    ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
 359    ** This prepares the state buffer for the next function call. */
 360
 361   /* Points to the start of the state buffer */
 362   pStateCurnt = S->pState;
 363
 364   /* Copy numTaps number of values */
 365   tapCnt = numTaps - 1u;
 366
 367   /* Copy the data */
 368   while(tapCnt > 0u)
 369   {
 370     *pStateCurnt++ = *pState++;
 371
 372     /* Decrement the loop counter */
 373     tapCnt--;
 374   }
 375
 376
 377 #endif /*  #ifndef ARM_MATH_CM0 */
 378
 379 }
 380
 381 /**
 382  * @} end of FIR group
 383  */