1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_fast_q31.c
10 * Description: Fast Q31 Correlation.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated.
28 * -------------------------------------------------------------------- */
33 * @ingroup groupFilters
42 * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
43 * @param[in] *pSrcA points to the first input sequence.
44 * @param[in] srcALen length of the first input sequence.
45 * @param[in] *pSrcB points to the second input sequence.
46 * @param[in] srcBLen length of the second input sequence.
47 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
51 * <b>Scaling and Overflow Behavior:</b>
54 * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
55 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
56 * These intermediate results are accumulated in a 32-bit register in 2.30 format.
57 * Finally, the accumulator is saturated and converted to a 1.31 result.
60 * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
61 * In order to avoid overflows completely the input signals must be scaled down.
62 * The input signals should be scaled down to avoid intermediate overflows.
63 * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
64 * maximum of min(srcALen, srcBLen) number of additions is carried internally.
67 * See <code>arm_correlate_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
70 void arm_correlate_fast_q31(
77 q31_t *pIn1; /* inputA pointer */
78 q31_t *pIn2; /* inputB pointer */
79 q31_t *pOut = pDst; /* output pointer */
80 q31_t *px; /* Intermediate inputA pointer */
81 q31_t *py; /* Intermediate inputB pointer */
82 q31_t *pSrc1; /* Intermediate pointers */
83 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
84 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
85 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
86 int32_t inc = 1; /* Destination address modifier */
89 /* The algorithm implementation is based on the lengths of the inputs. */
90 /* srcB is always made to slide across srcA. */
91 /* So srcBLen is always considered as shorter or equal to srcALen */
92 if(srcALen >= srcBLen)
94 /* Initialization of inputA pointer */
97 /* Initialization of inputB pointer */
100 /* Number of output samples is calculated */
101 outBlockSize = (2u * srcALen) - 1u;
103 /* When srcALen > srcBLen, zero padding is done to srcB
104 * to make their lengths equal.
105 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
106 * number of output samples are made zero */
107 j = outBlockSize - (srcALen + (srcBLen - 1u));
109 /* Updating the pointer position to non zero value */
115 /* Initialization of inputA pointer */
118 /* Initialization of inputB pointer */
121 /* srcBLen is always considered as shorter or equal to srcALen */
126 /* CORR(x, y) = Reverse order(CORR(y, x)) */
127 /* Hence set the destination pointer to point to the last output sample */
128 pOut = pDst + ((srcALen + srcBLen) - 2u);
130 /* Destination address modifier is set to -1 */
135 /* The function is internally
136 * divided into three parts according to the number of multiplications that has to be
137 * taken place between inputA samples and inputB samples. In the first part of the
138 * algorithm, the multiplications increase by one for every iteration.
139 * In the second part of the algorithm, srcBLen number of multiplications are done.
140 * In the third part of the algorithm, the multiplications decrease by one
141 * for every iteration.*/
142 /* The algorithm is implemented in three stages.
143 * The loop counters of each stage is initiated here. */
144 blockSize1 = srcBLen - 1u;
145 blockSize2 = srcALen - (srcBLen - 1u);
146 blockSize3 = blockSize1;
148 /* --------------------------
149 * Initializations of stage1
150 * -------------------------*/
152 /* sum = x[0] * y[srcBlen - 1]
153 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
155 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
158 /* In this stage the MAC operations are increased by 1 for every iteration.
159 The count variable holds the number of MAC operations performed */
162 /* Working pointer of inputA */
165 /* Working pointer of inputB */
166 pSrc1 = pIn2 + (srcBLen - 1u);
169 /* ------------------------
171 * ----------------------*/
173 /* The first stage starts here */
174 while(blockSize1 > 0u)
176 /* Accumulator is made zero for every iteration */
179 /* Apply loop unrolling and compute 4 MACs simultaneously. */
182 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
183 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
186 /* x[0] * y[srcBLen - 4] */
187 sum = (q31_t) ((((q63_t) sum << 32) +
188 ((q63_t) * px++ * (*py++))) >> 32);
189 /* x[1] * y[srcBLen - 3] */
190 sum = (q31_t) ((((q63_t) sum << 32) +
191 ((q63_t) * px++ * (*py++))) >> 32);
192 /* x[2] * y[srcBLen - 2] */
193 sum = (q31_t) ((((q63_t) sum << 32) +
194 ((q63_t) * px++ * (*py++))) >> 32);
195 /* x[3] * y[srcBLen - 1] */
196 sum = (q31_t) ((((q63_t) sum << 32) +
197 ((q63_t) * px++ * (*py++))) >> 32);
199 /* Decrement the loop counter */
203 /* If the count is not a multiple of 4, compute any remaining MACs here.
204 ** No loop unrolling is used. */
209 /* Perform the multiply-accumulates */
210 /* x[0] * y[srcBLen - 1] */
211 sum = (q31_t) ((((q63_t) sum << 32) +
212 ((q63_t) * px++ * (*py++))) >> 32);
214 /* Decrement the loop counter */
218 /* Store the result in the accumulator in the destination buffer. */
220 /* Destination pointer is updated according to the address modifier, inc */
223 /* Update the inputA and inputB pointers for next MAC calculation */
227 /* Increment the MAC count */
230 /* Decrement the loop counter */
234 /* --------------------------
235 * Initializations of stage2
236 * ------------------------*/
238 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
239 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
241 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
244 /* Working pointer of inputA */
247 /* Working pointer of inputB */
250 /* count is index by which the pointer pIn1 to be incremented */
253 /* -------------------
255 * ------------------*/
257 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
258 * So, to loop unroll over blockSize2,
259 * srcBLen should be greater than or equal to 4 */
262 /* Loop unroll over blockSize2, by 4 */
263 blkCnt = blockSize2 >> 2u;
267 /* Set all accumulators to zero */
273 /* read x[0], x[1], x[2] samples */
278 /* Apply loop unrolling and compute 4 MACs simultaneously. */
281 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
282 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
285 /* Read y[0] sample */
288 /* Read x[3] sample */
291 /* Perform the multiply-accumulate */
292 /* acc0 += x[0] * y[0] */
293 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
294 /* acc1 += x[1] * y[0] */
295 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
296 /* acc2 += x[2] * y[0] */
297 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
298 /* acc3 += x[3] * y[0] */
299 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
301 /* Read y[1] sample */
304 /* Read x[4] sample */
307 /* Perform the multiply-accumulates */
308 /* acc0 += x[1] * y[1] */
309 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
310 /* acc1 += x[2] * y[1] */
311 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
312 /* acc2 += x[3] * y[1] */
313 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
314 /* acc3 += x[4] * y[1] */
315 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
317 /* Read y[2] sample */
320 /* Read x[5] sample */
323 /* Perform the multiply-accumulates */
324 /* acc0 += x[2] * y[2] */
325 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
326 /* acc1 += x[3] * y[2] */
327 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
328 /* acc2 += x[4] * y[2] */
329 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
330 /* acc3 += x[5] * y[2] */
331 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
333 /* Read y[3] sample */
336 /* Read x[6] sample */
339 /* Perform the multiply-accumulates */
340 /* acc0 += x[3] * y[3] */
341 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
342 /* acc1 += x[4] * y[3] */
343 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
344 /* acc2 += x[5] * y[3] */
345 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
346 /* acc3 += x[6] * y[3] */
347 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
352 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
353 ** No loop unrolling is used. */
358 /* Read y[4] sample */
361 /* Read x[7] sample */
364 /* Perform the multiply-accumulates */
365 /* acc0 += x[4] * y[4] */
366 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
367 /* acc1 += x[5] * y[4] */
368 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
369 /* acc2 += x[6] * y[4] */
370 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
371 /* acc3 += x[7] * y[4] */
372 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
374 /* Reuse the present samples for the next MAC */
379 /* Decrement the loop counter */
383 /* Store the result in the accumulator in the destination buffer. */
384 *pOut = (q31_t) (acc0 << 1);
385 /* Destination pointer is updated according to the address modifier, inc */
388 *pOut = (q31_t) (acc1 << 1);
391 *pOut = (q31_t) (acc2 << 1);
394 *pOut = (q31_t) (acc3 << 1);
397 /* Update the inputA and inputB pointers for next MAC calculation */
398 px = pIn1 + (count * 4u);
401 /* Increment the pointer pIn1 index, count by 1 */
404 /* Decrement the loop counter */
408 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
409 ** No loop unrolling is used. */
410 blkCnt = blockSize2 % 0x4u;
414 /* Accumulator is made zero for every iteration */
417 /* Apply loop unrolling and compute 4 MACs simultaneously. */
420 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
421 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
424 /* Perform the multiply-accumulates */
425 sum = (q31_t) ((((q63_t) sum << 32) +
426 ((q63_t) * px++ * (*py++))) >> 32);
427 sum = (q31_t) ((((q63_t) sum << 32) +
428 ((q63_t) * px++ * (*py++))) >> 32);
429 sum = (q31_t) ((((q63_t) sum << 32) +
430 ((q63_t) * px++ * (*py++))) >> 32);
431 sum = (q31_t) ((((q63_t) sum << 32) +
432 ((q63_t) * px++ * (*py++))) >> 32);
434 /* Decrement the loop counter */
438 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
439 ** No loop unrolling is used. */
444 /* Perform the multiply-accumulate */
445 sum = (q31_t) ((((q63_t) sum << 32) +
446 ((q63_t) * px++ * (*py++))) >> 32);
448 /* Decrement the loop counter */
452 /* Store the result in the accumulator in the destination buffer. */
454 /* Destination pointer is updated according to the address modifier, inc */
457 /* Update the inputA and inputB pointers for next MAC calculation */
461 /* Increment the MAC count */
464 /* Decrement the loop counter */
470 /* If the srcBLen is not a multiple of 4,
471 * the blockSize2 loop cannot be unrolled by 4 */
476 /* Accumulator is made zero for every iteration */
479 /* Loop over srcBLen */
484 /* Perform the multiply-accumulate */
485 sum = (q31_t) ((((q63_t) sum << 32) +
486 ((q63_t) * px++ * (*py++))) >> 32);
488 /* Decrement the loop counter */
492 /* Store the result in the accumulator in the destination buffer. */
494 /* Destination pointer is updated according to the address modifier, inc */
497 /* Update the inputA and inputB pointers for next MAC calculation */
501 /* Increment the MAC count */
504 /* Decrement the loop counter */
509 /* --------------------------
510 * Initializations of stage3
511 * -------------------------*/
513 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
514 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
516 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
517 * sum += x[srcALen-1] * y[0]
520 /* In this stage the MAC operations are decreased by 1 for every iteration.
521 The count variable holds the number of MAC operations performed */
522 count = srcBLen - 1u;
524 /* Working pointer of inputA */
525 pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;
528 /* Working pointer of inputB */
531 /* -------------------
533 * ------------------*/
535 while(blockSize3 > 0u)
537 /* Accumulator is made zero for every iteration */
540 /* Apply loop unrolling and compute 4 MACs simultaneously. */
543 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
544 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
547 /* Perform the multiply-accumulates */
548 /* sum += x[srcALen - srcBLen + 4] * y[3] */
549 sum = (q31_t) ((((q63_t) sum << 32) +
550 ((q63_t) * px++ * (*py++))) >> 32);
551 /* sum += x[srcALen - srcBLen + 3] * y[2] */
552 sum = (q31_t) ((((q63_t) sum << 32) +
553 ((q63_t) * px++ * (*py++))) >> 32);
554 /* sum += x[srcALen - srcBLen + 2] * y[1] */
555 sum = (q31_t) ((((q63_t) sum << 32) +
556 ((q63_t) * px++ * (*py++))) >> 32);
557 /* sum += x[srcALen - srcBLen + 1] * y[0] */
558 sum = (q31_t) ((((q63_t) sum << 32) +
559 ((q63_t) * px++ * (*py++))) >> 32);
561 /* Decrement the loop counter */
565 /* If the count is not a multiple of 4, compute any remaining MACs here.
566 ** No loop unrolling is used. */
571 /* Perform the multiply-accumulates */
572 sum = (q31_t) ((((q63_t) sum << 32) +
573 ((q63_t) * px++ * (*py++))) >> 32);
575 /* Decrement the loop counter */
579 /* Store the result in the accumulator in the destination buffer. */
581 /* Destination pointer is updated according to the address modifier, inc */
584 /* Update the inputA and inputB pointers for next MAC calculation */
588 /* Decrement the MAC count */
591 /* Decrement the loop counter */
598 * @} end of Corr group