1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_lms_norm_q15.c
10 * Description: Q15 NLMS filter.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated
29 * Version 0.0.7 2010/06/10
30 * Misra-C changes done
31 * -------------------------------------------------------------------- */
36 * @ingroup groupFilters
40 * @addtogroup LMS_NORM
45 * @brief Processing function for Q15 normalized LMS filter.
46 * @param[in] *S points to an instance of the Q15 normalized LMS filter structure.
47 * @param[in] *pSrc points to the block of input data.
48 * @param[in] *pRef points to the block of reference data.
49 * @param[out] *pOut points to the block of output data.
50 * @param[out] *pErr points to the block of error data.
51 * @param[in] blockSize number of samples to process.
54 * <b>Scaling and Overflow Behavior:</b>
56 * The function is implemented using a 64-bit internal accumulator.
57 * Both coefficients and state variables are represented in 1.15 format and
58 * multiplications yield a 2.30 result. The 2.30 intermediate results are
59 * accumulated in a 64-bit accumulator in 34.30 format.
60 * There is no risk of internal overflow with this approach and the full
61 * precision of intermediate multiplications is preserved. After all additions
62 * have been performed, the accumulator is truncated to 34.15 format by
63 * discarding low 15 bits. Lastly, the accumulator is saturated to yield a
64 * result in 1.15 format.
67 * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
71 void arm_lms_norm_q15(
72 arm_lms_norm_instance_q15 * S,
79 q15_t *pState = S->pState; /* State pointer */
80 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
81 q15_t *pStateCurnt; /* Points to the current sample of the state */
82 q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
83 q15_t mu = S->mu; /* Adaptive factor */
84 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
85 uint32_t tapCnt, blkCnt; /* Loop counters */
86 q31_t energy; /* Energy of the input */
87 q63_t acc; /* Accumulator */
88 q15_t e = 0, d = 0; /* error, reference data sample */
89 q15_t w = 0, in; /* weight factor and state */
90 q15_t x0; /* temporary variable to hold input sample */
91 uint32_t shift = (uint32_t) S->postShift + 1u; /* Shift to be applied to the output */
92 q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
93 q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
94 q31_t coef; /* Teporary variable for coefficient */
99 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
100 /* pStateCurnt points to the location where the new input data should be written */
101 pStateCurnt = &(S->pState[(numTaps - 1u)]);
103 /* Loop over blockSize number of values */
109 /* Run the below code for Cortex-M4 and Cortex-M3 */
113 /* Copy the new input sample into the state buffer */
114 *pStateCurnt++ = *pSrc;
116 /* Initialize pState pointer */
119 /* Initialize coeff pointer */
122 /* Read the sample from input buffer */
125 /* Update the energy calculation */
126 energy -= (((q31_t) x0 * (x0)) >> 15);
127 energy += (((q31_t) in * (in)) >> 15);
129 /* Set the accumulator to zero */
132 /* Loop unrolling. Process 4 taps at a time. */
133 tapCnt = numTaps >> 2;
138 /* Perform the multiply-accumulate */
139 acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
140 acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
142 /* Decrement the loop counter */
146 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
147 tapCnt = numTaps % 0x4u;
151 /* Perform the multiply-accumulate */
152 acc += (((q31_t) * px++ * (*pb++)));
154 /* Decrement the loop counter */
158 /* Converting the result to 1.15 format */
159 acc = __SSAT((acc >> (16u - shift)), 16u);
161 /* Store the result from accumulator into the destination buffer. */
162 *pOut++ = (q15_t) acc;
164 /* Compute and store error */
169 /* Calculation of 1/energy */
170 postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
171 &oneByEnergy, S->recipTable);
173 /* Calculation of e * mu value */
174 errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
176 /* Calculation of (e * mu) * (1/energy) value */
177 acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
179 /* Weighting factor for the normalized version */
180 w = (q15_t) __SSAT((q31_t) acc, 16);
182 /* Initialize pState pointer */
185 /* Initialize coeff pointer */
188 /* Loop unrolling. Process 4 taps at a time. */
189 tapCnt = numTaps >> 2;
191 /* Update filter coefficients */
194 coef = *pb + (((q31_t) w * (*px++)) >> 15);
195 *pb++ = (q15_t) __SSAT((coef), 16);
196 coef = *pb + (((q31_t) w * (*px++)) >> 15);
197 *pb++ = (q15_t) __SSAT((coef), 16);
198 coef = *pb + (((q31_t) w * (*px++)) >> 15);
199 *pb++ = (q15_t) __SSAT((coef), 16);
200 coef = *pb + (((q31_t) w * (*px++)) >> 15);
201 *pb++ = (q15_t) __SSAT((coef), 16);
203 /* Decrement the loop counter */
207 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
208 tapCnt = numTaps % 0x4u;
212 /* Perform the multiply-accumulate */
213 coef = *pb + (((q31_t) w * (*px++)) >> 15);
214 *pb++ = (q15_t) __SSAT((coef), 16);
216 /* Decrement the loop counter */
220 /* Read the sample from state buffer */
223 /* Advance state pointer by 1 for the next sample */
224 pState = pState + 1u;
226 /* Decrement the loop counter */
230 /* Save energy and x0 values for the next frame */
231 S->energy = (q15_t) energy;
234 /* Processing is complete. Now copy the last numTaps - 1 samples to the
235 satrt of the state buffer. This prepares the state buffer for the
236 next function call. */
238 /* Points to the start of the pState buffer */
239 pStateCurnt = S->pState;
241 /* Calculation of count for copying integer writes */
242 tapCnt = (numTaps - 1u) >> 2;
247 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
248 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
254 /* Calculation of count for remaining q15_t data */
255 tapCnt = (numTaps - 1u) % 0x4u;
260 *pStateCurnt++ = *pState++;
262 /* Decrement the loop counter */
268 /* Run the below code for Cortex-M0 */
272 /* Copy the new input sample into the state buffer */
273 *pStateCurnt++ = *pSrc;
275 /* Initialize pState pointer */
278 /* Initialize pCoeffs pointer */
281 /* Read the sample from input buffer */
284 /* Update the energy calculation */
285 energy -= (((q31_t) x0 * (x0)) >> 15);
286 energy += (((q31_t) in * (in)) >> 15);
288 /* Set the accumulator to zero */
291 /* Loop over numTaps number of values */
296 /* Perform the multiply-accumulate */
297 acc += (((q31_t) * px++ * (*pb++)));
299 /* Decrement the loop counter */
303 /* Converting the result to 1.15 format */
304 acc = __SSAT((acc >> (16u - shift)), 16u);
306 /* Store the result from accumulator into the destination buffer. */
307 *pOut++ = (q15_t) acc;
309 /* Compute and store error */
314 /* Calculation of 1/energy */
315 postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
316 &oneByEnergy, S->recipTable);
318 /* Calculation of e * mu value */
319 errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
321 /* Calculation of (e * mu) * (1/energy) value */
322 acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
324 /* Weighting factor for the normalized version */
325 w = (q15_t) __SSAT((q31_t) acc, 16);
327 /* Initialize pState pointer */
330 /* Initialize coeff pointer */
333 /* Loop over numTaps number of values */
338 /* Perform the multiply-accumulate */
339 coef = *pb + (((q31_t) w * (*px++)) >> 15);
340 *pb++ = (q15_t) __SSAT((coef), 16);
342 /* Decrement the loop counter */
346 /* Read the sample from state buffer */
349 /* Advance state pointer by 1 for the next sample */
350 pState = pState + 1u;
352 /* Decrement the loop counter */
356 /* Save energy and x0 values for the next frame */
357 S->energy = (q15_t) energy;
360 /* Processing is complete. Now copy the last numTaps - 1 samples to the
361 satrt of the state buffer. This prepares the state buffer for the
362 next function call. */
364 /* Points to the start of the pState buffer */
365 pStateCurnt = S->pState;
367 /* copy (numTaps - 1u) data */
368 tapCnt = (numTaps - 1u);
373 *pStateCurnt++ = *pState++;
375 /* Decrement the loop counter */
379 #endif /* #ifndef ARM_MATH_CM0 */
385 * @} end of LMS_NORM group