1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
10 * Description: Q31 FIR filter processing function.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated.
29 * Version 0.0.5 2010/04/26
30 * incorporated review comments and updated with latest CMSIS layer
32 * Version 0.0.3 2010/03/10
34 * -------------------------------------------------------------------- */
39 * @ingroup groupFilters
48 * @param[in] *S points to an instance of the Q31 FIR filter structure.
49 * @param[in] *pSrc points to the block of input data.
50 * @param[out] *pDst points to the block of output data.
51 * @param[in] blockSize number of samples to process per call.
55 * <b>Scaling and Overflow Behavior:</b>
57 * The function is implemented using an internal 64-bit accumulator.
58 * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
59 * Thus, if the accumulator result overflows, it wraps around rather than clipping.
60 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
61 * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and cast to q31_t to yield the final 1.31 result; no explicit saturation is applied at this step.
64 * Refer to the function <code>arm_fir_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
68 const arm_fir_instance_q31 * S,
73 q31_t *pState = S->pState; /* State pointer */
74 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
75 q31_t *pStateCurnt; /* Points to the current sample of the state */
80 /* Run the below code for Cortex-M4 and Cortex-M3 */
82 q31_t x0, x1, x2, x3; /* Temporary variables to hold state */
83 q31_t c0; /* Temporary variable to hold coefficient value */
84 q31_t *px; /* Temporary pointer for state */
85 q31_t *pb; /* Temporary pointer for coefficient buffer */
86 q63_t acc0, acc1, acc2, acc3; /* Accumulators */
87 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
88 uint32_t i, tapCnt, blkCnt; /* Loop counters */
90 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
91 /* pStateCurnt points to the location where the new input data should be written */
92 pStateCurnt = &(S->pState[(numTaps - 1u)]);
94 /* Apply loop unrolling and compute 4 output values simultaneously.
95 * The variables acc0 ... acc3 hold output values that are being computed:
97 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
98 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
99 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
100 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
102 blkCnt = blockSize >> 2;
104 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
105 ** A second loop below computes the remaining 1 to 3 samples. */
108 /* Copy four new input samples into the state buffer */
109 *pStateCurnt++ = *pSrc++;
110 *pStateCurnt++ = *pSrc++;
111 *pStateCurnt++ = *pSrc++;
112 *pStateCurnt++ = *pSrc++;
114 /* Set all accumulators to zero */
120 /* Initialize state pointer */
123 /* Initialize coefficient pointer */
126 /* Read the first three samples from the state buffer:
127 * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
132 /* Loop unrolling. Process 4 taps at a time. */
133 tapCnt = numTaps >> 2;
138 /* Read the b[numTaps] coefficient */
141 /* Read x[n-numTaps-3] sample */
144 /* acc0 += b[numTaps] * x[n-numTaps] */
145 acc0 += ((q63_t) x0 * c0);
147 /* acc1 += b[numTaps] * x[n-numTaps-1] */
148 acc1 += ((q63_t) x1 * c0);
150 /* acc2 += b[numTaps] * x[n-numTaps-2] */
151 acc2 += ((q63_t) x2 * c0);
153 /* acc3 += b[numTaps] * x[n-numTaps-3] */
154 acc3 += ((q63_t) x3 * c0);
156 /* Read the b[numTaps-1] coefficient */
159 /* Read x[n-numTaps-4] sample */
162 /* Perform the multiply-accumulates */
163 acc0 += ((q63_t) x1 * c0);
164 acc1 += ((q63_t) x2 * c0);
165 acc2 += ((q63_t) x3 * c0);
166 acc3 += ((q63_t) x0 * c0);
168 /* Read the b[numTaps-2] coefficient */
171 /* Read x[n-numTaps-5] sample */
174 /* Perform the multiply-accumulates */
175 acc0 += ((q63_t) x2 * c0);
176 acc1 += ((q63_t) x3 * c0);
177 acc2 += ((q63_t) x0 * c0);
178 acc3 += ((q63_t) x1 * c0);
179 /* Read the b[numTaps-3] coefficient */
182 /* Read x[n-numTaps-6] sample */
185 /* Perform the multiply-accumulates */
186 acc0 += ((q63_t) x3 * c0);
187 acc1 += ((q63_t) x0 * c0);
188 acc2 += ((q63_t) x1 * c0);
189 acc3 += ((q63_t) x2 * c0);
193 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
195 i = numTaps - (tapCnt * 4u);
198 /* Read coefficients */
201 /* Fetch 1 state variable */
204 /* Perform the multiply-accumulates */
205 acc0 += ((q63_t) x0 * c0);
206 acc1 += ((q63_t) x1 * c0);
207 acc2 += ((q63_t) x2 * c0);
208 acc3 += ((q63_t) x3 * c0);
210 /* Reuse the present sample states for next sample */
215 /* Decrement the loop counter */
219 /* Advance the state pointer by 4 to process the next group of 4 samples */
222 /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31
223 ** Then store the 4 outputs in the destination buffer. */
224 *pDst++ = (q31_t) (acc0 >> 31u);
225 *pDst++ = (q31_t) (acc1 >> 31u);
226 *pDst++ = (q31_t) (acc2 >> 31u);
227 *pDst++ = (q31_t) (acc3 >> 31u);
229 /* Decrement the samples loop counter */
234 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
235 ** No loop unrolling is used. */
236 blkCnt = blockSize % 4u;
240 /* Copy one sample at a time into state buffer */
241 *pStateCurnt++ = *pSrc++;
243 /* Set the accumulator to zero */
246 /* Initialize state pointer */
249 /* Initialize Coefficient pointer */
254 /* Perform the multiply-accumulates */
257 acc0 += (q63_t) * (px++) * (*(pb++));
261 /* The result is in 2.62 format. Convert to 1.31
262 ** Then store the output in the destination buffer. */
263 *pDst++ = (q31_t) (acc0 >> 31u);
265 /* Advance state pointer by 1 for the next sample */
268 /* Decrement the samples loop counter */
272 /* Processing is complete.
273 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
274 ** This prepares the state buffer for the next function call. */
276 /* Points to the start of the state buffer */
277 pStateCurnt = S->pState;
279 tapCnt = (numTaps - 1u) >> 2u;
284 *pStateCurnt++ = *pState++;
285 *pStateCurnt++ = *pState++;
286 *pStateCurnt++ = *pState++;
287 *pStateCurnt++ = *pState++;
289 /* Decrement the loop counter */
293 /* Calculate remaining number of copies */
294 tapCnt = (numTaps - 1u) % 0x4u;
296 /* Copy the remaining q31_t data */
299 *pStateCurnt++ = *pState++;
301 /* Decrement the loop counter */
307 /* Run the below code for Cortex-M0 */
309 q31_t *px; /* Temporary pointer for state */
310 q31_t *pb; /* Temporary pointer for coefficient buffer */
311 q63_t acc; /* Accumulator */
312 uint32_t numTaps = S->numTaps; /* Length of the filter */
313 uint32_t i, tapCnt, blkCnt; /* Loop counters */
315 /* S->pState buffer contains previous frame (numTaps - 1) samples */
316 /* pStateCurnt points to the location where the new input data should be written */
317 pStateCurnt = &(S->pState[(numTaps - 1u)]);
319 /* Initialize blkCnt with blockSize */
324 /* Copy one sample at a time into state buffer */
325 *pStateCurnt++ = *pSrc++;
327 /* Set the accumulator to zero */
330 /* Initialize state pointer */
333 /* Initialize Coefficient pointer */
338 /* Perform the multiply-accumulates */
341 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
342 acc += (q63_t) * px++ * *pb++;
346 /* The result is in 2.62 format. Convert to 1.31
347 ** Then store the output in the destination buffer. */
348 *pDst++ = (q31_t) (acc >> 31u);
350 /* Advance state pointer by 1 for the next sample */
353 /* Decrement the samples loop counter */
357 /* Processing is complete.
358 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
359 ** This prepares the state buffer for the next function call. */
361 /* Points to the start of the state buffer */
362 pStateCurnt = S->pState;
364 /* Copy (numTaps - 1) values */
365 tapCnt = numTaps - 1u;
370 *pStateCurnt++ = *pState++;
372 /* Decrement the loop counter */
377 #endif /* #ifndef ARM_MATH_CM0 */
382 * @} end of FIR group