1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_f32.c
10 * Description: Convolution of floating-point sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated
29 * Version 0.0.7 2010/06/10
30 * Misra-C changes done
32 * -------------------------------------------------------------------------- */
37 * @ingroup groupFilters
41 * @defgroup Conv Convolution
43 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
44 * Convolution is similar to correlation and is frequently used in filtering and data analysis.
45 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
46 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
49 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
50 * Then the convolution
58 * \image html ConvolutionEquation.gif
60 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
61 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
62 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
63 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
66 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
67 * the signal <code>b[n]</code> slides over <code>a[n]</code>.
68 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
71 * Note that convolution is a commutative operation:
74 * a[n] * b[n] = b[n] * a[n].
78 * This means that switching the A and B arguments to the convolution functions has no effect.
80 * <b>Fixed-Point Behavior</b>
83 * Convolution requires summing up a large number of intermediate products.
84 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
85 * Refer to the function specific documentation below for further details of the particular algorithm used.
94 * @brief Convolution of floating-point sequences.
95 * @param[in] *pSrcA points to the first input sequence.
96 * @param[in] srcALen length of the first input sequence.
97 * @param[in] *pSrcB points to the second input sequence.
98 * @param[in] srcBLen length of the second input sequence.
99 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
114 /* Run the below code for Cortex-M4 and Cortex-M3 */
116 float32_t *pIn1; /* inputA pointer */
117 float32_t *pIn2; /* inputB pointer */
118 float32_t *pOut = pDst; /* output pointer */
119 float32_t *px; /* Intermediate inputA pointer */
120 float32_t *py; /* Intermediate inputB pointer */
121 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
122 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
123 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
124 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
126 /* The algorithm implementation is based on the lengths of the inputs. */
127 /* srcB is always made to slide across srcA. */
128 /* So srcBLen is always considered as shorter or equal to srcALen */
129 if(srcALen >= srcBLen)
131 /* Initialization of inputA pointer */
134 /* Initialization of inputB pointer */
139 /* Initialization of inputA pointer */
142 /* Initialization of inputB pointer */
145 /* srcBLen is always considered as shorter or equal to srcALen */
151 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
152 /* The function is internally
153 * divided into three stages according to the number of multiplications that has to be
154 * taken place between inputA samples and inputB samples. In the first stage of the
155 * algorithm, the multiplications increase by one for every iteration.
156 * In the second stage of the algorithm, srcBLen number of multiplications are done.
157 * In the third stage of the algorithm, the multiplications decrease by one
158 * for every iteration. */
160 /* The algorithm is implemented in three stages.
161 The loop counters of each stage is initiated here. */
162 blockSize1 = srcBLen - 1u;
163 blockSize2 = srcALen - (srcBLen - 1u);
164 blockSize3 = blockSize1;
166 /* --------------------------
167 * initializations of stage1
168 * -------------------------*/
171 * sum = x[0] * y[1] + x[1] * y[0]
173 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
176 /* In this stage the MAC operations are increased by 1 for every iteration.
177 The count variable holds the number of MAC operations performed */
180 /* Working pointer of inputA */
183 /* Working pointer of inputB */
187 /* ------------------------
189 * ----------------------*/
191 /* The first stage starts here */
192 while(blockSize1 > 0u)
194 /* Accumulator is made zero for every iteration */
197 /* Apply loop unrolling and compute 4 MACs simultaneously. */
200 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
201 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
204 /* x[0] * y[srcBLen - 1] */
205 sum += *px++ * *py--;
207 /* x[1] * y[srcBLen - 2] */
208 sum += *px++ * *py--;
210 /* x[2] * y[srcBLen - 3] */
211 sum += *px++ * *py--;
213 /* x[3] * y[srcBLen - 4] */
214 sum += *px++ * *py--;
216 /* Decrement the loop counter */
220 /* If the count is not a multiple of 4, compute any remaining MACs here.
221 ** No loop unrolling is used. */
226 /* Perform the multiply-accumulate */
227 sum += *px++ * *py--;
229 /* Decrement the loop counter */
233 /* Store the result in the accumulator in the destination buffer. */
236 /* Update the inputA and inputB pointers for next MAC calculation */
240 /* Increment the MAC count */
243 /* Decrement the loop counter */
247 /* --------------------------
248 * Initializations of stage2
249 * ------------------------*/
251 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
252 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
254 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
257 /* Working pointer of inputA */
260 /* Working pointer of inputB */
261 pSrc2 = pIn2 + (srcBLen - 1u);
264 /* count is index by which the pointer pIn1 to be incremented */
267 /* -------------------
269 * ------------------*/
271 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
272 * So, to loop unroll over blockSize2,
273 * srcBLen should be greater than or equal to 4 */
276 /* Loop unroll over blockSize2, by 4 */
277 blkCnt = blockSize2 >> 2u;
281 /* Set all accumulators to zero */
287 /* read x[0], x[1], x[2] samples */
292 /* Apply loop unrolling and compute 4 MACs simultaneously. */
295 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
296 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
299 /* Read y[srcBLen - 1] sample */
302 /* Read x[3] sample */
305 /* Perform the multiply-accumulate */
306 /* acc0 += x[0] * y[srcBLen - 1] */
309 /* acc1 += x[1] * y[srcBLen - 1] */
312 /* acc2 += x[2] * y[srcBLen - 1] */
315 /* acc3 += x[3] * y[srcBLen - 1] */
318 /* Read y[srcBLen - 2] sample */
321 /* Read x[4] sample */
324 /* Perform the multiply-accumulate */
325 /* acc0 += x[1] * y[srcBLen - 2] */
327 /* acc1 += x[2] * y[srcBLen - 2] */
329 /* acc2 += x[3] * y[srcBLen - 2] */
331 /* acc3 += x[4] * y[srcBLen - 2] */
334 /* Read y[srcBLen - 3] sample */
337 /* Read x[5] sample */
340 /* Perform the multiply-accumulates */
341 /* acc0 += x[2] * y[srcBLen - 3] */
343 /* acc1 += x[3] * y[srcBLen - 2] */
345 /* acc2 += x[4] * y[srcBLen - 2] */
347 /* acc3 += x[5] * y[srcBLen - 2] */
350 /* Read y[srcBLen - 4] sample */
353 /* Read x[6] sample */
356 /* Perform the multiply-accumulates */
357 /* acc0 += x[3] * y[srcBLen - 4] */
359 /* acc1 += x[4] * y[srcBLen - 4] */
361 /* acc2 += x[5] * y[srcBLen - 4] */
363 /* acc3 += x[6] * y[srcBLen - 4] */
369 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
370 ** No loop unrolling is used. */
375 /* Read y[srcBLen - 5] sample */
378 /* Read x[7] sample */
381 /* Perform the multiply-accumulates */
382 /* acc0 += x[4] * y[srcBLen - 5] */
384 /* acc1 += x[5] * y[srcBLen - 5] */
386 /* acc2 += x[6] * y[srcBLen - 5] */
388 /* acc3 += x[7] * y[srcBLen - 5] */
391 /* Reuse the present samples for the next MAC */
396 /* Decrement the loop counter */
400 /* Store the result in the accumulator in the destination buffer. */
406 /* Update the inputA and inputB pointers for next MAC calculation */
407 px = pIn1 + (count * 4u);
410 /* Increment the pointer pIn1 index, count by 1 */
413 /* Decrement the loop counter */
417 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
418 ** No loop unrolling is used. */
419 blkCnt = blockSize2 % 0x4u;
423 /* Accumulator is made zero for every iteration */
426 /* Apply loop unrolling and compute 4 MACs simultaneously. */
429 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
430 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
433 /* Perform the multiply-accumulates */
434 sum += *px++ * *py--;
435 sum += *px++ * *py--;
436 sum += *px++ * *py--;
437 sum += *px++ * *py--;
439 /* Decrement the loop counter */
443 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
444 ** No loop unrolling is used. */
449 /* Perform the multiply-accumulate */
450 sum += *px++ * *py--;
452 /* Decrement the loop counter */
456 /* Store the result in the accumulator in the destination buffer. */
459 /* Update the inputA and inputB pointers for next MAC calculation */
463 /* Increment the MAC count */
466 /* Decrement the loop counter */
472 /* If the srcBLen is not a multiple of 4,
473 * the blockSize2 loop cannot be unrolled by 4 */
478 /* Accumulator is made zero for every iteration */
481 /* srcBLen number of MACS should be performed */
486 /* Perform the multiply-accumulate */
487 sum += *px++ * *py--;
489 /* Decrement the loop counter */
493 /* Store the result in the accumulator in the destination buffer. */
496 /* Update the inputA and inputB pointers for next MAC calculation */
500 /* Increment the MAC count */
503 /* Decrement the loop counter */
509 /* --------------------------
510 * Initializations of stage3
511 * -------------------------*/
513 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
514 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
516 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
517 * sum += x[srcALen-1] * y[srcBLen-1]
520 /* In this stage the MAC operations are decreased by 1 for every iteration.
521 The blockSize3 variable holds the number of MAC operations performed */
523 /* Working pointer of inputA */
524 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
527 /* Working pointer of inputB */
528 pSrc2 = pIn2 + (srcBLen - 1u);
531 /* -------------------
533 * ------------------*/
535 while(blockSize3 > 0u)
537 /* Accumulator is made zero for every iteration */
540 /* Apply loop unrolling and compute 4 MACs simultaneously. */
541 k = blockSize3 >> 2u;
543 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
544 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
547 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
548 sum += *px++ * *py--;
550 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
551 sum += *px++ * *py--;
553 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
554 sum += *px++ * *py--;
556 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
557 sum += *px++ * *py--;
559 /* Decrement the loop counter */
563 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
564 ** No loop unrolling is used. */
565 k = blockSize3 % 0x4u;
569 /* Perform the multiply-accumulates */
570 /* sum += x[srcALen-1] * y[srcBLen-1] */
571 sum += *px++ * *py--;
573 /* Decrement the loop counter */
577 /* Store the result in the accumulator in the destination buffer. */
580 /* Update the inputA and inputB pointers for next MAC calculation */
584 /* Decrement the loop counter */
590 /* Run the below code for Cortex-M0 */
592 float32_t *pIn1 = pSrcA; /* inputA pointer */
593 float32_t *pIn2 = pSrcB; /* inputB pointer */
594 float32_t sum; /* Accumulator */
595 uint32_t i, j; /* loop counters */
597 /* Loop to calculate convolution for output length number of times */
598 for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
600 /* Initialize sum with zero to carry out MAC operations */
603 /* Loop to perform MAC operations according to convolution equation */
604 for (j = 0u; j <= i; j++)
606 /* Check the array limitations */
607 if((((i - j) < srcBLen) && (j < srcALen)))
609 /* z[i] += x[i-j] * y[j] */
610 sum += pIn1[j] * pIn2[i - j];
613 /* Store the output in the destination buffer */
617 #endif /* #ifndef ARM_MATH_CM0 */
622 * @} end of Conv group