1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_lms_norm_q31.c
10 * Description: Processing function for the Q31 NLMS filter.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated
29 * Version 0.0.7 2010/06/10
30 * Misra-C changes done
31 * -------------------------------------------------------------------- */
36 * @ingroup groupFilters
40 * @addtogroup LMS_NORM
45 * @brief Processing function for Q31 normalized LMS filter.
46 * @param[in] *S points to an instance of the Q31 normalized LMS filter structure.
47 * @param[in] *pSrc points to the block of input data.
48 * @param[in] *pRef points to the block of reference data.
49 * @param[out] *pOut points to the block of output data.
50 * @param[out] *pErr points to the block of error data.
51 * @param[in] blockSize number of samples to process.
54 * <b>Scaling and Overflow Behavior:</b>
56 * The function is implemented using an internal 64-bit accumulator.
57 * The accumulator has a 2.62 format and maintains full precision of the intermediate
58 * multiplication results but provides only a single guard bit.
59 * Thus, if the accumulator result overflows it wraps around rather than clip.
60 * In order to avoid overflows completely the input signal must be scaled down by
61 * log2(numTaps) bits. The reference signal should not be scaled down.
62 * After all multiply-accumulates are performed, the 2.62 accumulator is shifted
63 * and saturated to 1.31 format to yield the final result.
64 * The output signal and error signal are in 1.31 format.
67 * In this filter, filter coefficients are updated for each sample and the
68 * updation of filter cofficients are saturted.
72 void arm_lms_norm_q31(
73 arm_lms_norm_instance_q31 * S,
80 q31_t *pState = S->pState; /* State pointer */
81 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
82 q31_t *pStateCurnt; /* Points to the current sample of the state */
83 q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
84 q31_t mu = S->mu; /* Adaptive factor */
85 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
86 uint32_t tapCnt, blkCnt; /* Loop counters */
87 q63_t energy; /* Energy of the input */
88 q63_t acc; /* Accumulator */
89 q31_t e = 0, d = 0; /* error, reference data sample */
90 q31_t w = 0, in; /* weight factor and state */
91 q31_t x0; /* temporary variable to hold input sample */
92 uint32_t shift = 32u - ((uint32_t) S->postShift + 1u); /* Shift to be applied to the output */
93 q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
94 q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
95 q31_t coef; /* Temporary variable for coef */
100 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
101 /* pStateCurnt points to the location where the new input data should be written */
102 pStateCurnt = &(S->pState[(numTaps - 1u)]);
104 /* Loop over blockSize number of values */
110 /* Run the below code for Cortex-M4 and Cortex-M3 */
115 /* Copy the new input sample into the state buffer */
116 *pStateCurnt++ = *pSrc;
118 /* Initialize pState pointer */
121 /* Initialize coeff pointer */
124 /* Read the sample from input buffer */
127 /* Update the energy calculation */
128 energy = (q31_t) ((((q63_t) energy << 32) -
129 (((q63_t) x0 * x0) << 1)) >> 32);
130 energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
132 /* Set the accumulator to zero */
135 /* Loop unrolling. Process 4 taps at a time. */
136 tapCnt = numTaps >> 2;
140 /* Perform the multiply-accumulate */
141 acc += ((q63_t) (*px++)) * (*pb++);
142 acc += ((q63_t) (*px++)) * (*pb++);
143 acc += ((q63_t) (*px++)) * (*pb++);
144 acc += ((q63_t) (*px++)) * (*pb++);
146 /* Decrement the loop counter */
150 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
151 tapCnt = numTaps % 0x4u;
155 /* Perform the multiply-accumulate */
156 acc += ((q63_t) (*px++)) * (*pb++);
158 /* Decrement the loop counter */
162 /* Converting the result to 1.31 format */
163 acc = (q31_t) (acc >> shift);
165 /* Store the result from accumulator into the destination buffer. */
166 *pOut++ = (q31_t) acc;
168 /* Compute and store error */
173 /* Calculates the reciprocal of energy */
174 postShift = arm_recip_q31(energy + DELTA_Q31,
175 &oneByEnergy, &S->recipTable[0]);
177 /* Calculation of product of (e * mu) */
178 errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
180 /* Weighting factor for the normalized version */
181 w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift));
183 /* Initialize pState pointer */
186 /* Initialize coeff pointer */
189 /* Loop unrolling. Process 4 taps at a time. */
190 tapCnt = numTaps >> 2;
192 /* Update filter coefficients */
195 /* Perform the multiply-accumulate */
197 /* coef is in 2.30 format */
198 coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
199 /* get coef in 1.31 format by left shifting */
200 *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
201 /* update coefficient buffer to next coefficient */
204 coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
205 *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
208 coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
209 *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
212 coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
213 *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
216 /* Decrement the loop counter */
220 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
221 tapCnt = numTaps % 0x4u;
225 /* Perform the multiply-accumulate */
226 coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
227 *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
230 /* Decrement the loop counter */
234 /* Read the sample from state buffer */
237 /* Advance state pointer by 1 for the next sample */
240 /* Decrement the loop counter */
244 /* Save energy and x0 values for the next frame */
245 S->energy = (q31_t) energy;
248 /* Processing is complete. Now copy the last numTaps - 1 samples to the
249 satrt of the state buffer. This prepares the state buffer for the
250 next function call. */
252 /* Points to the start of the pState buffer */
253 pStateCurnt = S->pState;
255 /* Loop unrolling for (numTaps - 1u) samples copy */
256 tapCnt = (numTaps - 1u) >> 2u;
261 *pStateCurnt++ = *pState++;
262 *pStateCurnt++ = *pState++;
263 *pStateCurnt++ = *pState++;
264 *pStateCurnt++ = *pState++;
266 /* Decrement the loop counter */
270 /* Calculate remaining number of copies */
271 tapCnt = (numTaps - 1u) % 0x4u;
273 /* Copy the remaining q31_t data */
276 *pStateCurnt++ = *pState++;
278 /* Decrement the loop counter */
284 /* Run the below code for Cortex-M0 */
289 /* Copy the new input sample into the state buffer */
290 *pStateCurnt++ = *pSrc;
292 /* Initialize pState pointer */
295 /* Initialize pCoeffs pointer */
298 /* Read the sample from input buffer */
301 /* Update the energy calculation */
303 (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32);
304 energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
306 /* Set the accumulator to zero */
309 /* Loop over numTaps number of values */
314 /* Perform the multiply-accumulate */
315 acc += ((q63_t) (*px++)) * (*pb++);
317 /* Decrement the loop counter */
321 /* Converting the result to 1.31 format */
322 acc = (q31_t) (acc >> shift);
324 /* Store the result from accumulator into the destination buffer. */
325 *pOut++ = (q31_t) acc;
327 /* Compute and store error */
332 /* Calculates the reciprocal of energy */
334 arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]);
336 /* Calculation of product of (e * mu) */
337 errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
339 /* Weighting factor for the normalized version */
340 w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift));
342 /* Initialize pState pointer */
345 /* Initialize coeff pointer */
348 /* Loop over numTaps number of values */
353 /* Perform the multiply-accumulate */
354 /* coef is in 2.30 format */
355 coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
356 /* get coef in 1.31 format by left shifting */
357 *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
358 /* update coefficient buffer to next coefficient */
361 /* Decrement the loop counter */
365 /* Read the sample from state buffer */
368 /* Advance state pointer by 1 for the next sample */
371 /* Decrement the loop counter */
375 /* Save energy and x0 values for the next frame */
376 S->energy = (q31_t) energy;
379 /* Processing is complete. Now copy the last numTaps - 1 samples to the
380 start of the state buffer. This prepares the state buffer for the
381 next function call. */
383 /* Points to the start of the pState buffer */
384 pStateCurnt = S->pState;
386 /* Loop for (numTaps - 1u) samples copy */
387 tapCnt = (numTaps - 1u);
389 /* Copy the remaining q31_t data */
392 *pStateCurnt++ = *pState++;
394 /* Decrement the loop counter */
398 #endif /* #ifndef ARM_MATH_CM0 */
403 * @} end of LMS_NORM group