1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_fast_q31.c
10 * Description: Q31 Convolution (fast version).
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated.
28 * -------------------------------------------------------------------- */
33 * @ingroup groupFilters
42 * @param[in] *pSrcA points to the first input sequence.
43 * @param[in] srcALen length of the first input sequence.
44 * @param[in] *pSrcB points to the second input sequence.
45 * @param[in] srcBLen length of the second input sequence.
46 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
50 * <b>Scaling and Overflow Behavior:</b>
53 * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
54 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
55 * These intermediate results are accumulated in a 32-bit register in 2.30 format.
56 * Finally, the accumulator is saturated and converted to a 1.31 result.
59 * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
60 * In order to avoid overflows completely the input signals must be scaled down.
61 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to base 2) to avoid overflows,
62 * as at most min(srcALen, srcBLen) additions are carried out internally.
65 * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
68 void arm_conv_fast_q31(
75 q31_t *pIn1; /* inputA pointer */
76 q31_t *pIn2; /* inputB pointer */
77 q31_t *pOut = pDst; /* output pointer */
78 q31_t *px; /* Intermediate inputA pointer */
79 q31_t *py; /* Intermediate inputB pointer */
80 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
81 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
82 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
83 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
86 /* The algorithm implementation is based on the lengths of the inputs. */
87 /* srcB is always made to slide across srcA. */
88 /* So srcBLen is always considered as shorter or equal to srcALen */
89 if(srcALen >= srcBLen)
91 /* Initialization of inputA pointer */
94 /* Initialization of inputB pointer */
99 /* Initialization of inputA pointer */
102 /* Initialization of inputB pointer */
105 /* srcBLen is always considered as shorter or equal to srcALen */
111 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
112 /* The function is internally
113 * divided into three stages according to the number of multiplications that have to
114 * take place between inputA samples and inputB samples. In the first stage of the
115 * algorithm, the multiplications increase by one for every iteration.
116 * In the second stage of the algorithm, srcBLen number of multiplications are done.
117 * In the third stage of the algorithm, the multiplications decrease by one
118 * for every iteration. */
120 /* The algorithm is implemented in three stages.
121 The loop counters of each stage are initialized here. */
122 blockSize1 = srcBLen - 1u;
123 blockSize2 = srcALen - (srcBLen - 1u);
124 blockSize3 = blockSize1;
126 /* --------------------------
127 * Initializations of stage1
128 * -------------------------*/
131 * sum = x[0] * y[1] + x[1] * y[0]
133 * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
136 /* In this stage the MAC operations are increased by 1 for every iteration.
137 The count variable holds the number of MAC operations performed */
140 /* Working pointer of inputA */
143 /* Working pointer of inputB */
147 /* ------------------------
149 * ----------------------*/
151 /* The first stage starts here */
152 while(blockSize1 > 0u)
154 /* Accumulator is made zero for every iteration */
157 /* Apply loop unrolling and compute 4 MACs simultaneously. */
160 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
161 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
164 /* x[0] * y[srcBLen - 1] */
165 sum = (q31_t) ((((q63_t) sum << 32) +
166 ((q63_t) * px++ * (*py--))) >> 32);
168 /* x[1] * y[srcBLen - 2] */
169 sum = (q31_t) ((((q63_t) sum << 32) +
170 ((q63_t) * px++ * (*py--))) >> 32);
172 /* x[2] * y[srcBLen - 3] */
173 sum = (q31_t) ((((q63_t) sum << 32) +
174 ((q63_t) * px++ * (*py--))) >> 32);
176 /* x[3] * y[srcBLen - 4] */
177 sum = (q31_t) ((((q63_t) sum << 32) +
178 ((q63_t) * px++ * (*py--))) >> 32);
180 /* Decrement the loop counter */
184 /* If the count is not a multiple of 4, compute any remaining MACs here.
185 ** No loop unrolling is used. */
190 /* Perform the multiply-accumulate */
191 sum = (q31_t) ((((q63_t) sum << 32) +
192 ((q63_t) * px++ * (*py--))) >> 32);
194 /* Decrement the loop counter */
198 /* Store the result in the accumulator in the destination buffer. */
201 /* Update the inputA and inputB pointers for next MAC calculation */
205 /* Increment the MAC count */
208 /* Decrement the loop counter */
212 /* --------------------------
213 * Initializations of stage2
214 * ------------------------*/
216 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
217 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
219 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
222 /* Working pointer of inputA */
225 /* Working pointer of inputB */
226 pSrc2 = pIn2 + (srcBLen - 1u);
229 /* count is the index by which the pointer pIn1 is to be incremented */
232 /* -------------------
234 * ------------------*/
236 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
237 * So, to loop unroll over blockSize2,
238 * srcBLen should be greater than or equal to 4 */
241 /* Loop unroll over blockSize2, by 4 */
242 blkCnt = blockSize2 >> 2u;
246 /* Set all accumulators to zero */
252 /* read x[0], x[1], x[2] samples */
257 /* Apply loop unrolling and compute 4 MACs simultaneously. */
260 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
261 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
264 /* Read y[srcBLen - 1] sample */
267 /* Read x[3] sample */
270 /* Perform the multiply-accumulates */
271 /* acc0 += x[0] * y[srcBLen - 1] */
272 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
274 /* acc1 += x[1] * y[srcBLen - 1] */
275 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
277 /* acc2 += x[2] * y[srcBLen - 1] */
278 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
280 /* acc3 += x[3] * y[srcBLen - 1] */
281 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
283 /* Read y[srcBLen - 2] sample */
286 /* Read x[4] sample */
289 /* Perform the multiply-accumulate */
290 /* acc0 += x[1] * y[srcBLen - 2] */
291 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
292 /* acc1 += x[2] * y[srcBLen - 2] */
293 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
294 /* acc2 += x[3] * y[srcBLen - 2] */
295 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
296 /* acc3 += x[4] * y[srcBLen - 2] */
297 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
299 /* Read y[srcBLen - 3] sample */
302 /* Read x[5] sample */
305 /* Perform the multiply-accumulates */
306 /* acc0 += x[2] * y[srcBLen - 3] */
307 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
308 /* acc1 += x[3] * y[srcBLen - 3] */
309 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
310 /* acc2 += x[4] * y[srcBLen - 3] */
311 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
312 /* acc3 += x[5] * y[srcBLen - 3] */
313 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
315 /* Read y[srcBLen - 4] sample */
318 /* Read x[6] sample */
321 /* Perform the multiply-accumulates */
322 /* acc0 += x[3] * y[srcBLen - 4] */
323 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
324 /* acc1 += x[4] * y[srcBLen - 4] */
325 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
326 /* acc2 += x[5] * y[srcBLen - 4] */
327 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
328 /* acc3 += x[6] * y[srcBLen - 4] */
329 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
334 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
335 ** No loop unrolling is used. */
340 /* Read y[srcBLen - 5] sample */
343 /* Read x[7] sample */
346 /* Perform the multiply-accumulates */
347 /* acc0 += x[4] * y[srcBLen - 5] */
348 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
349 /* acc1 += x[5] * y[srcBLen - 5] */
350 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
351 /* acc2 += x[6] * y[srcBLen - 5] */
352 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
353 /* acc3 += x[7] * y[srcBLen - 5] */
354 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
356 /* Reuse the present samples for the next MAC */
361 /* Decrement the loop counter */
365 /* Store the results in the accumulators in the destination buffer. */
366 *pOut++ = (q31_t) (acc0 << 1);
367 *pOut++ = (q31_t) (acc1 << 1);
368 *pOut++ = (q31_t) (acc2 << 1);
369 *pOut++ = (q31_t) (acc3 << 1);
371 /* Update the inputA and inputB pointers for next MAC calculation */
372 px = pIn1 + (count * 4u);
375 /* Increment the pointer pIn1 index, count by 1 */
378 /* Decrement the loop counter */
382 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
383 ** No loop unrolling is used. */
384 blkCnt = blockSize2 % 0x4u;
388 /* Accumulator is made zero for every iteration */
391 /* Apply loop unrolling and compute 4 MACs simultaneously. */
394 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
395 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
398 /* Perform the multiply-accumulates */
399 sum = (q31_t) ((((q63_t) sum << 32) +
400 ((q63_t) * px++ * (*py--))) >> 32);
401 sum = (q31_t) ((((q63_t) sum << 32) +
402 ((q63_t) * px++ * (*py--))) >> 32);
403 sum = (q31_t) ((((q63_t) sum << 32) +
404 ((q63_t) * px++ * (*py--))) >> 32);
405 sum = (q31_t) ((((q63_t) sum << 32) +
406 ((q63_t) * px++ * (*py--))) >> 32);
408 /* Decrement the loop counter */
412 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
413 ** No loop unrolling is used. */
418 /* Perform the multiply-accumulate */
419 sum = (q31_t) ((((q63_t) sum << 32) +
420 ((q63_t) * px++ * (*py--))) >> 32);
422 /* Decrement the loop counter */
426 /* Store the result in the accumulator in the destination buffer. */
429 /* Update the inputA and inputB pointers for next MAC calculation */
433 /* Increment the MAC count */
436 /* Decrement the loop counter */
442 /* If srcBLen is less than 4,
443 * the blockSize2 loop cannot be unrolled by 4 */
448 /* Accumulator is made zero for every iteration */
451 /* srcBLen number of MACS should be performed */
456 /* Perform the multiply-accumulate */
457 sum = (q31_t) ((((q63_t) sum << 32) +
458 ((q63_t) * px++ * (*py--))) >> 32);
460 /* Decrement the loop counter */
464 /* Store the result in the accumulator in the destination buffer. */
467 /* Update the inputA and inputB pointers for next MAC calculation */
471 /* Increment the MAC count */
474 /* Decrement the loop counter */
480 /* --------------------------
481 * Initializations of stage3
482 * -------------------------*/
484 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
485 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
487 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
488 * sum += x[srcALen-1] * y[srcBLen-1]
491 /* In this stage the MAC operations are decreased by 1 for every iteration.
492 The blockSize3 variable holds the number of MAC operations performed */
494 /* Working pointer of inputA */
495 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
498 /* Working pointer of inputB */
499 pSrc2 = pIn2 + (srcBLen - 1u);
502 /* -------------------
504 * ------------------*/
506 while(blockSize3 > 0u)
508 /* Accumulator is made zero for every iteration */
511 /* Apply loop unrolling and compute 4 MACs simultaneously. */
512 k = blockSize3 >> 2u;
514 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
515 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
518 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
519 sum = (q31_t) ((((q63_t) sum << 32) +
520 ((q63_t) * px++ * (*py--))) >> 32);
522 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
523 sum = (q31_t) ((((q63_t) sum << 32) +
524 ((q63_t) * px++ * (*py--))) >> 32);
526 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
527 sum = (q31_t) ((((q63_t) sum << 32) +
528 ((q63_t) * px++ * (*py--))) >> 32);
530 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
531 sum = (q31_t) ((((q63_t) sum << 32) +
532 ((q63_t) * px++ * (*py--))) >> 32);
534 /* Decrement the loop counter */
538 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
539 ** No loop unrolling is used. */
540 k = blockSize3 % 0x4u;
544 /* Perform the multiply-accumulate */
545 sum = (q31_t) ((((q63_t) sum << 32) +
546 ((q63_t) * px++ * (*py--))) >> 32);
548 /* Decrement the loop counter */
552 /* Store the result in the accumulator in the destination buffer. */
555 /* Update the inputA and inputB pointers for next MAC calculation */
559 /* Decrement the loop counter */
566 * @} end of Conv group