1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_f32.c
10 * Description: Correlation of floating-point sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated
29 * Version 0.0.7 2010/06/10
30 * Misra-C changes done
32 * -------------------------------------------------------------------------- */
37 * @ingroup groupFilters
41 * @defgroup Corr Correlation
43 * Correlation is a mathematical operation that is similar to convolution.
44 * As with convolution, correlation uses two signals to produce a third signal.
45 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
46 * Correlation is commonly used to measure the similarity between two signals.
47 * It has applications in pattern recognition, cryptanalysis, and searching.
48 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
49 * Fast versions of the Q15 and Q31 functions are also provided.
52 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
53 * The convolution of the two signals is denoted by
57 * In correlation, one of the signals is flipped in time
63 * and this is mathematically defined as
64 * \image html CorrelateEquation.gif
66 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
67 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
68 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
72 * The <code>pDst</code> should be initialized to all zeros before being used.
74 * <b>Fixed-Point Behavior</b>
76 * Correlation requires summing up a large number of intermediate products.
77 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
78 * Refer to the function specific documentation below for further details of the particular algorithm used.
86 * @brief Correlation of floating-point sequences.
87 * @param[in] *pSrcA points to the first input sequence.
88 * @param[in] srcALen length of the first input sequence.
89 * @param[in] *pSrcB points to the second input sequence.
90 * @param[in] srcBLen length of the second input sequence.
91 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
95 void arm_correlate_f32(
106 /* Run the below code for Cortex-M4 and Cortex-M3 */
108 float32_t *pIn1; /* inputA pointer */
109 float32_t *pIn2; /* inputB pointer */
110 float32_t *pOut = pDst; /* output pointer */
111 float32_t *px; /* Intermediate inputA pointer */
112 float32_t *py; /* Intermediate inputB pointer */
113 float32_t *pSrc1; /* Intermediate pointers */
114 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
115 float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
116 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */
117 int32_t inc = 1; /* Destination address modifier */
120 /* The algorithm implementation is based on the lengths of the inputs. */
121 /* srcB is always made to slide across srcA. */
122 /* So srcBLen is always considered as shorter or equal to srcALen */
123 /* But CORR(x, y) is reverse of CORR(y, x) */
124 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
125 /* and the destination pointer modifier, inc is set to -1 */
126 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
127 /* But to improve the performance,
128 * we include zeroes in the output instead of zero padding either of the the inputs*/
129 /* If srcALen > srcBLen,
130 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
131 /* If srcALen < srcBLen,
132 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
133 if(srcALen >= srcBLen)
135 /* Initialization of inputA pointer */
138 /* Initialization of inputB pointer */
141 /* Number of output samples is calculated */
142 outBlockSize = (2u * srcALen) - 1u;
144 /* When srcALen > srcBLen, zero padding has to be done to srcB
145 * to make their lengths equal.
146 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
147 * number of output samples are made zero */
148 j = outBlockSize - (srcALen + (srcBLen - 1u));
150 /* Updating the pointer position to non zero value */
155 // /* Zero is stored in the destination buffer */
158 // /* Decrement the loop counter */
165 /* Initialization of inputA pointer */
168 /* Initialization of inputB pointer */
171 /* srcBLen is always considered as shorter or equal to srcALen */
176 /* CORR(x, y) = Reverse order(CORR(y, x)) */
177 /* Hence set the destination pointer to point to the last output sample */
178 pOut = pDst + ((srcALen + srcBLen) - 2u);
180 /* Destination address modifier is set to -1 */
185 /* The function is internally
186 * divided into three parts according to the number of multiplications that has to be
187 * taken place between inputA samples and inputB samples. In the first part of the
188 * algorithm, the multiplications increase by one for every iteration.
189 * In the second part of the algorithm, srcBLen number of multiplications are done.
190 * In the third part of the algorithm, the multiplications decrease by one
191 * for every iteration.*/
192 /* The algorithm is implemented in three stages.
193 * The loop counters of each stage is initiated here. */
194 blockSize1 = srcBLen - 1u;
195 blockSize2 = srcALen - (srcBLen - 1u);
196 blockSize3 = blockSize1;
198 /* --------------------------
199 * Initializations of stage1
200 * -------------------------*/
202 /* sum = x[0] * y[srcBlen - 1]
203 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
205 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
208 /* In this stage the MAC operations are increased by 1 for every iteration.
209 The count variable holds the number of MAC operations performed */
212 /* Working pointer of inputA */
215 /* Working pointer of inputB */
216 pSrc1 = pIn2 + (srcBLen - 1u);
219 /* ------------------------
221 * ----------------------*/
223 /* The first stage starts here */
224 while(blockSize1 > 0u)
226 /* Accumulator is made zero for every iteration */
229 /* Apply loop unrolling and compute 4 MACs simultaneously. */
232 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
233 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
236 /* x[0] * y[srcBLen - 4] */
237 sum += *px++ * *py++;
238 /* x[1] * y[srcBLen - 3] */
239 sum += *px++ * *py++;
240 /* x[2] * y[srcBLen - 2] */
241 sum += *px++ * *py++;
242 /* x[3] * y[srcBLen - 1] */
243 sum += *px++ * *py++;
245 /* Decrement the loop counter */
249 /* If the count is not a multiple of 4, compute any remaining MACs here.
250 ** No loop unrolling is used. */
255 /* Perform the multiply-accumulate */
256 /* x[0] * y[srcBLen - 1] */
257 sum += *px++ * *py++;
259 /* Decrement the loop counter */
263 /* Store the result in the accumulator in the destination buffer. */
265 /* Destination pointer is updated according to the address modifier, inc */
268 /* Update the inputA and inputB pointers for next MAC calculation */
272 /* Increment the MAC count */
275 /* Decrement the loop counter */
279 /* --------------------------
280 * Initializations of stage2
281 * ------------------------*/
283 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
284 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
286 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
289 /* Working pointer of inputA */
292 /* Working pointer of inputB */
295 /* count is index by which the pointer pIn1 to be incremented */
298 /* -------------------
300 * ------------------*/
302 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
303 * So, to loop unroll over blockSize2,
304 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
307 /* Loop unroll over blockSize2, by 4 */
308 blkCnt = blockSize2 >> 2u;
312 /* Set all accumulators to zero */
318 /* read x[0], x[1], x[2] samples */
323 /* Apply loop unrolling and compute 4 MACs simultaneously. */
326 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
327 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
330 /* Read y[0] sample */
333 /* Read x[3] sample */
336 /* Perform the multiply-accumulate */
337 /* acc0 += x[0] * y[0] */
339 /* acc1 += x[1] * y[0] */
341 /* acc2 += x[2] * y[0] */
343 /* acc3 += x[3] * y[0] */
346 /* Read y[1] sample */
349 /* Read x[4] sample */
352 /* Perform the multiply-accumulate */
353 /* acc0 += x[1] * y[1] */
355 /* acc1 += x[2] * y[1] */
357 /* acc2 += x[3] * y[1] */
359 /* acc3 += x[4] * y[1] */
362 /* Read y[2] sample */
365 /* Read x[5] sample */
368 /* Perform the multiply-accumulates */
369 /* acc0 += x[2] * y[2] */
371 /* acc1 += x[3] * y[2] */
373 /* acc2 += x[4] * y[2] */
375 /* acc3 += x[5] * y[2] */
378 /* Read y[3] sample */
381 /* Read x[6] sample */
384 /* Perform the multiply-accumulates */
385 /* acc0 += x[3] * y[3] */
387 /* acc1 += x[4] * y[3] */
389 /* acc2 += x[5] * y[3] */
391 /* acc3 += x[6] * y[3] */
397 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
398 ** No loop unrolling is used. */
403 /* Read y[4] sample */
406 /* Read x[7] sample */
409 /* Perform the multiply-accumulates */
410 /* acc0 += x[4] * y[4] */
412 /* acc1 += x[5] * y[4] */
414 /* acc2 += x[6] * y[4] */
416 /* acc3 += x[7] * y[4] */
419 /* Reuse the present samples for the next MAC */
424 /* Decrement the loop counter */
428 /* Store the result in the accumulator in the destination buffer. */
430 /* Destination pointer is updated according to the address modifier, inc */
442 /* Update the inputA and inputB pointers for next MAC calculation */
443 px = pIn1 + (count * 4u);
446 /* Increment the pointer pIn1 index, count by 1 */
449 /* Decrement the loop counter */
453 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
454 ** No loop unrolling is used. */
455 blkCnt = blockSize2 % 0x4u;
459 /* Accumulator is made zero for every iteration */
462 /* Apply loop unrolling and compute 4 MACs simultaneously. */
465 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
466 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
469 /* Perform the multiply-accumulates */
470 sum += *px++ * *py++;
471 sum += *px++ * *py++;
472 sum += *px++ * *py++;
473 sum += *px++ * *py++;
475 /* Decrement the loop counter */
479 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
480 ** No loop unrolling is used. */
485 /* Perform the multiply-accumulate */
486 sum += *px++ * *py++;
488 /* Decrement the loop counter */
492 /* Store the result in the accumulator in the destination buffer. */
494 /* Destination pointer is updated according to the address modifier, inc */
497 /* Update the inputA and inputB pointers for next MAC calculation */
501 /* Increment the pointer pIn1 index, count by 1 */
504 /* Decrement the loop counter */
510 /* If the srcBLen is not a multiple of 4,
511 * the blockSize2 loop cannot be unrolled by 4 */
516 /* Accumulator is made zero for every iteration */
519 /* Loop over srcBLen */
524 /* Perform the multiply-accumulate */
525 sum += *px++ * *py++;
527 /* Decrement the loop counter */
531 /* Store the result in the accumulator in the destination buffer. */
533 /* Destination pointer is updated according to the address modifier, inc */
536 /* Update the inputA and inputB pointers for next MAC calculation */
540 /* Increment the pointer pIn1 index, count by 1 */
543 /* Decrement the loop counter */
548 /* --------------------------
549 * Initializations of stage3
550 * -------------------------*/
552 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
553 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
555 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
556 * sum += x[srcALen-1] * y[0]
559 /* In this stage the MAC operations are decreased by 1 for every iteration.
560 The count variable holds the number of MAC operations performed */
561 count = srcBLen - 1u;
563 /* Working pointer of inputA */
564 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
567 /* Working pointer of inputB */
570 /* -------------------
572 * ------------------*/
574 while(blockSize3 > 0u)
576 /* Accumulator is made zero for every iteration */
579 /* Apply loop unrolling and compute 4 MACs simultaneously. */
582 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
583 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
586 /* Perform the multiply-accumulates */
587 /* sum += x[srcALen - srcBLen + 4] * y[3] */
588 sum += *px++ * *py++;
589 /* sum += x[srcALen - srcBLen + 3] * y[2] */
590 sum += *px++ * *py++;
591 /* sum += x[srcALen - srcBLen + 2] * y[1] */
592 sum += *px++ * *py++;
593 /* sum += x[srcALen - srcBLen + 1] * y[0] */
594 sum += *px++ * *py++;
596 /* Decrement the loop counter */
600 /* If the count is not a multiple of 4, compute any remaining MACs here.
601 ** No loop unrolling is used. */
606 /* Perform the multiply-accumulates */
607 sum += *px++ * *py++;
609 /* Decrement the loop counter */
613 /* Store the result in the accumulator in the destination buffer. */
615 /* Destination pointer is updated according to the address modifier, inc */
618 /* Update the inputA and inputB pointers for next MAC calculation */
622 /* Decrement the MAC count */
625 /* Decrement the loop counter */
631 /* Run the below code for Cortex-M0 */
633 float32_t *pIn1 = pSrcA; /* inputA pointer */
634 float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
635 float32_t sum; /* Accumulator */
636 uint32_t i = 0u, j; /* loop counters */
637 uint32_t inv = 0u; /* Reverse order flag */
638 uint32_t tot = 0u; /* Length */
640 /* The algorithm implementation is based on the lengths of the inputs. */
641 /* srcB is always made to slide across srcA. */
642 /* So srcBLen is always considered as shorter or equal to srcALen */
643 /* But CORR(x, y) is reverse of CORR(y, x) */
644 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
645 /* and a varaible, inv is set to 1 */
646 /* If lengths are not equal then zero pad has to be done to make the two
647 * inputs of same length. But to improve the performance, we include zeroes
648 * in the output instead of zero padding either of the the inputs*/
649 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
650 * starting of the output buffer */
651 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
652 * ending of the output buffer */
653 /* Once the zero padding is done the remaining of the output is calcualted
654 * using convolution but with the shorter signal time shifted. */
656 /* Calculate the length of the remaining sequence */
657 tot = ((srcALen + srcBLen) - 2u);
659 if(srcALen > srcBLen)
661 /* Calculating the number of zeros to be padded to the output */
662 j = srcALen - srcBLen;
664 /* Initialise the pointer after zero padding */
668 else if(srcALen < srcBLen)
670 /* Initialization to inputB pointer */
673 /* Initialization to the end of inputA pointer */
674 pIn2 = pSrcA + (srcALen - 1u);
676 /* Initialisation of the pointer after zero padding */
679 /* Swapping the lengths */
684 /* Setting the reverse flag */
689 /* Loop to calculate convolution for output length number of times */
690 for (i = 0u; i <= tot; i++)
692 /* Initialize sum with zero to carry on MAC operations */
695 /* Loop to perform MAC operations according to convolution equation */
696 for (j = 0u; j <= i; j++)
698 /* Check the array limitations */
699 if((((i - j) < srcBLen) && (j < srcALen)))
701 /* z[i] += x[i-j] * y[j] */
702 sum += pIn1[j] * pIn2[-((int32_t) i - j)];
705 /* Store the output in the destination buffer */
712 #endif /* #ifndef ARM_MATH_CM0 */
717 * @} end of Corr group