1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q7.c
10 * Description: Q7 sparse FIR filter processing function.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated
29 * Version 0.0.7 2010/06/10
30 * Misra-C changes done
31 * ------------------------------------------------------------------- */
36 * @ingroup groupFilters
40 * @addtogroup FIR_Sparse
46 * @brief Processing function for the Q7 sparse FIR filter.
47 * @param[in] *S points to an instance of the Q7 sparse FIR structure.
48 * @param[in] *pSrc points to the block of input data.
49 * @param[out] *pDst points to the block of output data
50 * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
51 * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
52 * @param[in] blockSize number of input samples to process per call.
55 * <b>Scaling and Overflow Behavior:</b>
57 * The function is implemented using a 32-bit internal accumulator.
58 * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
59 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
60 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
61 * The accumulator is then converted to 18.7 format by discarding the low 7 bits.
62 * Finally, the result is saturated to 1.7 format.
/* Q7 sparse FIR processing function: copies the input block into the state
 * buffer, builds per-tap products of state samples and coefficients in the
 * pScratchOut buffer, then shifts, saturates and stores the results to pDst. */
65 void arm_fir_sparse_q7(
66 arm_fir_sparse_instance_q7 * S,
74 q7_t *pState = S->pState; /* State buffer pointer taken from the instance */
75 q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer taken from the instance */
76 q7_t *px; /* Working pointer for the scratch buffer of state values */
77 q7_t *py = pState; /* Working pointer for the state buffer */
78 q7_t *pb = pScratchIn; /* Working pointer for the pScratchIn buffer */
79 q7_t *pOut = pDst; /* Destination pointer */
80 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing the offsets of the non-zero tap values. */
81 uint32_t delaySize = S->maxDelay + blockSize; /* State buffer length: maxDelay + blockSize */
82 uint16_t numTaps = S->numTaps; /* Number of taps, from S->numTaps */
83 int32_t readIndex; /* Read index into the state buffer */
84 uint32_t tapCnt, blkCnt; /* Tap loop counter and block loop counter */
85 q7_t coeff = *pCoeffs++; /* First coefficient; pCoeffs is advanced past it */
86 q31_t *pScr2 = pScratchOut; /* Read pointer over the q31 scratch buffer of output values */
92 /* Run the below code for Cortex-M4 and Cortex-M3 */
94 q7_t in1, in2, in3, in4; /* Four saturated q7 outputs held before being packed into one store */
96 /* blockSize input samples are copied into the state buffer */
97 /* S->stateIndex gives the starting position to write in the state buffer */
98 arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
101 /* Loop over the number of taps. */
104 /* Read index for this tap: step back blockSize samples from stateIndex,
   then back again by the tap's delay offset. */
105 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
107 /* Wraparound of readIndex: add the state length to bring it back into range */
110 readIndex += (int32_t) delaySize;
113 /* Working pointer for state buffer is updated */
116 /* blockSize samples are read from the state buffer, starting at readIndex, into pb */
117 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
118 (int32_t) blockSize, 1, blockSize);
120 /* Working pointer for the scratch buffer of state values */
123 /* Working pointer for scratch buffer of output values */
126 /* Loop over the blockSize. Unroll by a factor of 4.
127 * Compute 4 multiplications at a time. */
128 blkCnt = blockSize >> 2;
132 /* Multiply state sample by the coefficient (q7 * q7 widened to q31) and store in the scratch buffer */
133 *pScratchOut++ = ((q31_t) * px++ * coeff);
134 *pScratchOut++ = ((q31_t) * px++ * coeff);
135 *pScratchOut++ = ((q31_t) * px++ * coeff);
136 *pScratchOut++ = ((q31_t) * px++ * coeff);
138 /* Decrement the loop counter */
142 /* If the blockSize is not a multiple of 4,
143 * compute the remaining samples */
144 blkCnt = blockSize % 0x4u;
148 /* Perform multiplication and store in the scratch buffer */
149 *pScratchOut++ = ((q31_t) * px++ * coeff);
151 /* Decrement the loop counter */
155 /* Load the coefficient value and
156 * increment the coefficient buffer for the next set of state values */
159 /* Read index for the next tap, computed the same way as for the first tap. */
160 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
162 /* Wraparound of readIndex */
165 readIndex += (int32_t) delaySize;
168 /* Loop over the remaining taps (the first tap was handled above). */
169 tapCnt = (uint32_t) numTaps - 1u;
173 /* Working pointer for state buffer is updated */
176 /* blockSize samples are read from the state buffer, starting at readIndex, into pb */
177 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
178 (int32_t) blockSize, 1, blockSize);
180 /* Working pointer for the scratch buffer of state values */
183 /* Working pointer for scratch buffer of output values */
186 /* Loop over the blockSize. Unroll by a factor of 4.
187 * Compute 4 MACS at a time. */
188 blkCnt = blockSize >> 2;
192 /* Perform Multiply-Accumulate: add the new product to the value already in the scratch buffer */
193 in = *pScratchOut + ((q31_t) * px++ * coeff);
195 in = *pScratchOut + ((q31_t) * px++ * coeff);
197 in = *pScratchOut + ((q31_t) * px++ * coeff);
199 in = *pScratchOut + ((q31_t) * px++ * coeff);
202 /* Decrement the loop counter */
206 /* If the blockSize is not a multiple of 4,
207 * compute the remaining samples */
208 blkCnt = blockSize % 0x4u;
212 /* Perform Multiply-Accumulate */
213 in = *pScratchOut + ((q31_t) * px++ * coeff);
216 /* Decrement the loop counter */
220 /* Load the coefficient value and
221 * increment the coefficient buffer for the next set of state values */
224 /* Read index for the next tap, computed the same way as above. */
225 readIndex = ((int32_t) S->stateIndex -
226 (int32_t) blockSize) - *pTapDelay++;
228 /* Wraparound of readIndex */
231 readIndex += (int32_t) delaySize;
234 /* Decrement the tap loop counter */
238 /* All the output values are in pScratchOut buffer.
239 Shift right by 7, saturate to 8 bits (1.7 format) and store in the destination buffer. */
240 /* Loop over the blockSize, four outputs per iteration. */
241 blkCnt = blockSize >> 2;
245 in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
246 in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
247 in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
248 in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
/* Pack the four q7 results and write them through pOut in one store.
   NOTE(review): __SIMD32 presumably performs a 32-bit access - confirm the alignment expectations for pDst. */
250 *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
252 /* Decrement the blockSize loop counter */
256 /* If the blockSize is not a multiple of 4,
257 remaining samples are processed in the below loop */
258 blkCnt = blockSize % 0x4u;
262 *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
264 /* Decrement the blockSize loop counter */
270 /* Run the below code for Cortex-M0 */
272 /* blockSize input samples are copied into the state buffer */
273 /* S->stateIndex gives the starting position to write in the state buffer */
274 arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
277 /* Loop over the number of taps. */
280 /* Read index for this tap: step back blockSize samples from stateIndex,
   then back again by the tap's delay offset. */
281 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
283 /* Wraparound of readIndex: add the state length to bring it back into range */
286 readIndex += (int32_t) delaySize;
289 /* Working pointer for state buffer is updated */
292 /* blockSize samples are read from the state buffer, starting at readIndex, into pb */
293 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
294 (int32_t) blockSize, 1, blockSize);
296 /* Working pointer for the scratch buffer of state values */
299 /* Working pointer for scratch buffer of output values */
302 /* Loop over the blockSize */
307 /* Multiply state sample by the coefficient (q7 * q7 widened to q31) and store in the scratch buffer */
308 *pScratchOut++ = ((q31_t) * px++ * coeff);
310 /* Decrement the loop counter */
314 /* Load the coefficient value and
315 * increment the coefficient buffer for the next set of state values */
318 /* Read index for the next tap, computed the same way as for the first tap. */
319 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
321 /* Wraparound of readIndex */
324 readIndex += (int32_t) delaySize;
327 /* Loop over the remaining taps (the first tap was handled above). */
328 tapCnt = (uint32_t) numTaps - 1u;
332 /* Working pointer for state buffer is updated */
335 /* blockSize samples are read from the state buffer, starting at readIndex, into pb */
336 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
337 (int32_t) blockSize, 1, blockSize);
339 /* Working pointer for the scratch buffer of state values */
342 /* Working pointer for scratch buffer of output values */
345 /* Loop over the blockSize */
350 /* Perform Multiply-Accumulate: add the new product to the value already in the scratch buffer */
351 in = *pScratchOut + ((q31_t) * px++ * coeff);
354 /* Decrement the loop counter */
358 /* Load the coefficient value and
359 * increment the coefficient buffer for the next set of state values */
362 /* Read Index, from where the state buffer should be read, is calculated. */
364 ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
366 /* Wraparound of readIndex */
369 readIndex += (int32_t) delaySize;
372 /* Decrement the tap loop counter */
376 /* All the output values are in pScratchOut buffer.
377 Shift right by 7, saturate to 8 bits (1.7 format) and store in the destination buffer. */
378 /* Loop over the blockSize. */
383 *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
385 /* Decrement the blockSize loop counter */
389 #endif /* #ifndef ARM_MATH_CM0 */
394 * @} end of FIR_Sparse group