1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q31.c
10 * Description: Q31 sparse FIR filter processing function.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated
29 * Version 0.0.7 2010/06/10
30 * Misra-C changes done
31 * ------------------------------------------------------------------- */
36 * @addtogroup FIR_Sparse
41 * @brief Processing function for the Q31 sparse FIR filter.
42 * @param[in] *S points to an instance of the Q31 sparse FIR structure.
43 * @param[in] *pSrc points to the block of input data.
44 * @param[out] *pDst points to the block of output data
45 * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
46 * @param[in] blockSize number of input samples to process per call.
49 * <b>Scaling and Overflow Behavior:</b>
51 * The function is implemented using an internal 32-bit accumulator.
52 * The 1.31 x 1.31 multiplications are truncated to 2.30 format.
53 * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
54 * If the accumulator result overflows, it wraps around rather than saturating.
55 * In order to avoid overflows, the input signal or coefficients must be scaled down by log2(numTaps) bits.
/* Q31 sparse FIR processing: the first tap is applied with a plain multiply
 * into the output scratch buffer, every remaining tap is multiply-accumulated
 * on top of it, and the accumulated block is finally converted to 1.31. */
58 void arm_fir_sparse_q31(
59 arm_fir_sparse_instance_q31 * S,
66 q31_t *pState = S->pState; /* State buffer pointer taken from the instance */
67 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer taken from the instance */
68 q31_t *px; /* Scratch buffer pointer */
69 q31_t *py = pState; /* Working pointer into the state buffer */
70 q31_t *pb = pScratchIn; /* Working pointer into the input scratch buffer */
71 q31_t *pOut; /* Destination pointer */
72 q63_t out; /* 64-bit temporary used while accumulating an output sample */
73 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing the offsets of the non-zero tap values. */
74 uint32_t delaySize = S->maxDelay + blockSize; /* State length: maxDelay + blockSize */
75 uint16_t numTaps = S->numTaps; /* Number of filter taps */
76 int32_t readIndex; /* Read index of the state buffer */
77 uint32_t tapCnt, blkCnt; /* Loop counters */
78 q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
82 /* blockSize input samples are copied into the state buffer */
83 /* S->stateIndex gives the starting position to write in the state buffer */
/* NOTE(review): the f32-named circular helper is called with int32_t casts;
 * presumably it only moves 32-bit words - confirm against its definition. */
84 arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
85 (int32_t *) pSrc, 1, blockSize);
87 /* Read index = (stateIndex - blockSize) minus the delay of the current tap;
 * pTapDelay then advances to the next tap. */
88 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
90 /* Wraparound of readIndex: add the state length (delaySize) */
93 readIndex += (int32_t) delaySize;
96 /* Working pointer for state buffer is updated */
99 /* blockSize samples are read from the state buffer */
100 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
101 (int32_t *) pb, (int32_t *) pb, blockSize, 1,
104 /* Working pointer for the scratch buffer of state values */
107 /* Working pointer for scratch buffer of output values */
113 /* Run the below code for Cortex-M4 and Cortex-M3 */
115 /* Loop over the blockSize. Unroll by a factor of 4.
116 * Compute 4 Multiplications at a time. */
117 blkCnt = blockSize >> 2;
121 /* Perform Multiplications and store in the destination buffer.
 * The 64-bit product is shifted right by 32 so only its upper word is kept
 * (the truncation to 2.30 described in the function header). */
122 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
123 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
124 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
125 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
127 /* Decrement the loop counter */
131 /* If the blockSize is not a multiple of 4,
132 * compute the remaining samples */
133 blkCnt = blockSize % 0x4u;
137 /* Perform Multiplications and store in the destination buffer */
138 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
140 /* Decrement the loop counter */
144 /* Load the coefficient value and
145 * increment the coefficient buffer for the next set of state values */
148 /* Read index for the next tap: (stateIndex - blockSize) minus that tap's delay. */
149 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
151 /* Wraparound of readIndex: add the state length (delaySize) */
154 readIndex += (int32_t) delaySize;
157 /* Loop over the remaining taps: the first coefficient was already applied
 * above, so numTaps - 1 taps are left. */
158 tapCnt = (uint32_t) numTaps - 1u;
162 /* Working pointer for state buffer is updated */
165 /* blockSize samples are read from the state buffer */
166 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
167 (int32_t *) pb, (int32_t *) pb, blockSize, 1,
170 /* Working pointer for the scratch buffer of state values */
173 /* Working pointer for scratch buffer of output values */
176 /* Loop over the blockSize. Unroll by a factor of 4.
177 * Compute 4 MACS at a time. */
178 blkCnt = blockSize >> 2;
/* Each step adds the upper word of the 64-bit product to the running value
 * and stores the result back, narrowed to q31_t. */
183 out += ((q63_t) * px++ * coeff) >> 32;
184 *pOut++ = (q31_t) (out);
187 out += ((q63_t) * px++ * coeff) >> 32;
188 *pOut++ = (q31_t) (out);
191 out += ((q63_t) * px++ * coeff) >> 32;
192 *pOut++ = (q31_t) (out);
195 out += ((q63_t) * px++ * coeff) >> 32;
196 *pOut++ = (q31_t) (out);
198 /* Decrement the loop counter */
202 /* If the blockSize is not a multiple of 4,
203 * compute the remaining samples */
204 blkCnt = blockSize % 0x4u;
208 /* Perform Multiply-Accumulate */
210 out += ((q63_t) * px++ * coeff) >> 32;
211 *pOut++ = (q31_t) (out);
213 /* Decrement the loop counter */
217 /* Load the coefficient value and
218 * increment the coefficient buffer for the next set of state values */
221 /* Read index for the next tap: (stateIndex - blockSize) minus that tap's delay. */
/* NOTE(review): this appears to run on every tap iteration, including the
 * last one, which would read one element past the pTapDelay array (and the
 * coefficient array) - confirm against the full loop structure. */
222 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
224 /* Wraparound of readIndex: add the state length (delaySize) */
227 readIndex += (int32_t) delaySize;
230 /* Decrement the tap loop counter */
234 /* Working output pointer is updated */
237 /* Output is converted into 1.31 format. */
238 /* Loop over the blockSize. Unroll by a factor of 4.
239 * process 4 output samples at a time. */
240 blkCnt = blockSize >> 2;
253 /* Decrement the loop counter */
257 /* If the blockSize is not a multiple of 4,
258 * process the remaining output samples */
259 blkCnt = blockSize % 0x4u;
266 /* Decrement the loop counter */
272 /* Run the below code for Cortex-M0 */
277 /* Perform Multiplications and store in the destination buffer.
 * Only the upper word of the 64-bit product is kept (shift right by 32). */
278 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
280 /* Decrement the loop counter */
284 /* Load the coefficient value and
285 * increment the coefficient buffer for the next set of state values */
288 /* Read index for the next tap: (stateIndex - blockSize) minus that tap's delay. */
289 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
291 /* Wraparound of readIndex: add the state length (delaySize) */
294 readIndex += (int32_t) delaySize;
297 /* Loop over the remaining taps: the first coefficient was already applied
 * above, so numTaps - 1 taps are left. */
298 tapCnt = (uint32_t) numTaps - 1u;
302 /* Working pointer for state buffer is updated */
305 /* blockSize samples are read from the state buffer */
306 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
307 (int32_t *) pb, (int32_t *) pb, blockSize, 1,
310 /* Working pointer for the scratch buffer of state values */
313 /* Working pointer for scratch buffer of output values */
320 /* Perform Multiply-Accumulate */
322 out += ((q63_t) * px++ * coeff) >> 32;
323 *pOut++ = (q31_t) (out);
325 /* Decrement the loop counter */
329 /* Load the coefficient value and
330 * increment the coefficient buffer for the next set of state values */
333 /* Read index for the next tap: (stateIndex - blockSize) minus that tap's delay. */
/* NOTE(review): same concern as in the Cortex-M3/M4 path - on the final tap
 * iteration this may read one element past pTapDelay; confirm. */
334 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
336 /* Wraparound of readIndex: add the state length (delaySize) */
339 readIndex += (int32_t) delaySize;
342 /* Decrement the tap loop counter */
346 /* Working output pointer is updated */
349 /* Output is converted into 1.31 format. */
357 /* Decrement the loop counter */
361 #endif /* #ifndef ARM_MATH_CM0 */
366 * @} end of FIR_Sparse group