00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_f32.c 00009 * 00010 * Description: Correlation of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00095 void arm_correlate_f32( 00096 float32_t * pSrcA, 00097 uint32_t srcALen, 00098 float32_t * pSrcB, 00099 uint32_t srcBLen, 00100 float32_t * pDst) 00101 { 00102 00103 00104 #ifndef ARM_MATH_CM0 00105 00106 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00107 00108 float32_t *pIn1; /* inputA pointer */ 00109 float32_t *pIn2; /* inputB pointer */ 00110 float32_t *pOut = pDst; /* output pointer */ 00111 float32_t *px; /* Intermediate inputA pointer */ 00112 float32_t *py; /* Intermediate inputB pointer */ 00113 float32_t *pSrc1; /* Intermediate pointers */ 00114 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00115 float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00116 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */ 00117 int32_t inc = 1; /* Destination address modifier */ 00118 00119 00120 /* The algorithm implementation is based on the lengths of the inputs. */ 00121 /* srcB is always made to slide across srcA. */ 00122 /* So srcBLen is always considered as shorter or equal to srcALen */ 00123 /* But CORR(x, y) is reverse of CORR(y, x) */ 00124 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00125 /* and the destination pointer modifier, inc is set to -1 */ 00126 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00127 /* But to improve the performance, 00128 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00129 /* If srcALen > srcBLen, 00130 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00131 /* If srcALen < srcBLen, 00132 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00133 if(srcALen >= srcBLen) 00134 { 00135 /* Initialization of inputA pointer */ 00136 pIn1 = pSrcA; 00137 00138 /* Initialization of inputB pointer */ 00139 pIn2 = pSrcB; 00140 00141 /* Number of output samples is calculated */ 00142 outBlockSize = (2u * srcALen) - 1u; 00143 00144 /* When srcALen > srcBLen, zero padding has to be done to srcB 00145 * to make their lengths equal. 00146 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00147 * number of output samples are made zero */ 00148 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00149 00150 /* Updating the pointer position to non zero value */ 00151 pOut += j; 00152 00153 //while(j > 0u) 00154 //{ 00155 // /* Zero is stored in the destination buffer */ 00156 // *pOut++ = 0.0f; 00157 00158 // /* Decrement the loop counter */ 00159 // j--; 00160 //} 00161 00162 } 00163 else 00164 { 00165 /* Initialization of inputA pointer */ 00166 pIn1 = pSrcB; 00167 00168 /* Initialization of inputB pointer */ 00169 pIn2 = pSrcA; 00170 00171 /* srcBLen is always considered as shorter or equal to srcALen */ 00172 j = srcBLen; 00173 srcBLen = srcALen; 00174 srcALen = j; 00175 00176 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00177 /* Hence set the destination pointer to point to the last output sample */ 00178 pOut = pDst + ((srcALen + srcBLen) - 2u); 00179 00180 /* Destination address modifier is set to -1 */ 00181 inc = -1; 00182 00183 } 00184 00185 /* The function is internally 00186 * divided into three parts according to the number of multiplications that has to be 00187 * taken place between inputA samples and inputB samples. In the first part of the 00188 * algorithm, the multiplications increase by one for every iteration. 00189 * In the second part of the algorithm, srcBLen number of multiplications are done. 00190 * In the third part of the algorithm, the multiplications decrease by one 00191 * for every iteration.*/ 00192 /* The algorithm is implemented in three stages. 00193 * The loop counters of each stage is initiated here. */ 00194 blockSize1 = srcBLen - 1u; 00195 blockSize2 = srcALen - (srcBLen - 1u); 00196 blockSize3 = blockSize1; 00197 00198 /* -------------------------- 00199 * Initializations of stage1 00200 * -------------------------*/ 00201 00202 /* sum = x[0] * y[srcBlen - 1] 00203 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] 00204 * .... 00205 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00206 */ 00207 00208 /* In this stage the MAC operations are increased by 1 for every iteration. 00209 The count variable holds the number of MAC operations performed */ 00210 count = 1u; 00211 00212 /* Working pointer of inputA */ 00213 px = pIn1; 00214 00215 /* Working pointer of inputB */ 00216 pSrc1 = pIn2 + (srcBLen - 1u); 00217 py = pSrc1; 00218 00219 /* ------------------------ 00220 * Stage1 process 00221 * ----------------------*/ 00222 00223 /* The first stage starts here */ 00224 while(blockSize1 > 0u) 00225 { 00226 /* Accumulator is made zero for every iteration */ 00227 sum = 0.0f; 00228 00229 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00230 k = count >> 2u; 00231 00232 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00233 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00234 while(k > 0u) 00235 { 00236 /* x[0] * y[srcBLen - 4] */ 00237 sum += *px++ * *py++; 00238 /* x[1] * y[srcBLen - 3] */ 00239 sum += *px++ * *py++; 00240 /* x[2] * y[srcBLen - 2] */ 00241 sum += *px++ * *py++; 00242 /* x[3] * y[srcBLen - 1] */ 00243 sum += *px++ * *py++; 00244 00245 /* Decrement the loop counter */ 00246 k--; 00247 } 00248 00249 /* If the count is not a multiple of 4, compute any remaining MACs here. 00250 ** No loop unrolling is used. */ 00251 k = count % 0x4u; 00252 00253 while(k > 0u) 00254 { 00255 /* Perform the multiply-accumulate */ 00256 /* x[0] * y[srcBLen - 1] */ 00257 sum += *px++ * *py++; 00258 00259 /* Decrement the loop counter */ 00260 k--; 00261 } 00262 00263 /* Store the result in the accumulator in the destination buffer. */ 00264 *pOut = sum; 00265 /* Destination pointer is updated according to the address modifier, inc */ 00266 pOut += inc; 00267 00268 /* Update the inputA and inputB pointers for next MAC calculation */ 00269 py = pSrc1 - count; 00270 px = pIn1; 00271 00272 /* Increment the MAC count */ 00273 count++; 00274 00275 /* Decrement the loop counter */ 00276 blockSize1--; 00277 } 00278 00279 /* -------------------------- 00280 * Initializations of stage2 00281 * ------------------------*/ 00282 00283 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00284 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00285 * .... 00286 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00287 */ 00288 00289 /* Working pointer of inputA */ 00290 px = pIn1; 00291 00292 /* Working pointer of inputB */ 00293 py = pIn2; 00294 00295 /* count is index by which the pointer pIn1 to be incremented */ 00296 count = 1u; 00297 00298 /* ------------------- 00299 * Stage2 process 00300 * ------------------*/ 00301 00302 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00303 * So, to loop unroll over blockSize2, 00304 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00305 if(srcBLen >= 4u) 00306 { 00307 /* Loop unroll over blockSize2, by 4 */ 00308 blkCnt = blockSize2 >> 2u; 00309 00310 while(blkCnt > 0u) 00311 { 00312 /* Set all accumulators to zero */ 00313 acc0 = 0.0f; 00314 acc1 = 0.0f; 00315 acc2 = 0.0f; 00316 acc3 = 0.0f; 00317 00318 /* read x[0], x[1], x[2] samples */ 00319 x0 = *(px++); 00320 x1 = *(px++); 00321 x2 = *(px++); 00322 00323 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00324 k = srcBLen >> 2u; 00325 00326 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00327 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00328 do 00329 { 00330 /* Read y[0] sample */ 00331 c0 = *(py++); 00332 00333 /* Read x[3] sample */ 00334 x3 = *(px++); 00335 00336 /* Perform the multiply-accumulate */ 00337 /* acc0 += x[0] * y[0] */ 00338 acc0 += x0 * c0; 00339 /* acc1 += x[1] * y[0] */ 00340 acc1 += x1 * c0; 00341 /* acc2 += x[2] * y[0] */ 00342 acc2 += x2 * c0; 00343 /* acc3 += x[3] * y[0] */ 00344 acc3 += x3 * c0; 00345 00346 /* Read y[1] sample */ 00347 c0 = *(py++); 00348 00349 /* Read x[4] sample */ 00350 x0 = *(px++); 00351 00352 /* Perform the multiply-accumulate */ 00353 /* acc0 += x[1] * y[1] */ 00354 acc0 += x1 * c0; 00355 /* acc1 += x[2] * y[1] */ 00356 acc1 += x2 * c0; 00357 /* acc2 += x[3] * y[1] */ 00358 acc2 += x3 * c0; 00359 /* acc3 += x[4] * y[1] */ 00360 acc3 += x0 * c0; 00361 00362 /* Read y[2] sample */ 00363 c0 = *(py++); 00364 00365 /* Read x[5] sample */ 00366 x1 = *(px++); 00367 00368 /* Perform the multiply-accumulates */ 00369 /* acc0 += x[2] * y[2] */ 00370 acc0 += x2 * c0; 00371 /* acc1 += x[3] * y[2] */ 00372 acc1 += x3 * c0; 00373 /* acc2 += x[4] * y[2] */ 00374 acc2 += x0 * c0; 00375 /* acc3 += x[5] * y[2] */ 00376 acc3 += x1 * c0; 00377 00378 /* Read y[3] sample */ 00379 c0 = *(py++); 00380 00381 /* Read x[6] sample */ 00382 x2 = *(px++); 00383 00384 /* Perform the multiply-accumulates */ 00385 /* acc0 += x[3] * y[3] */ 00386 acc0 += x3 * c0; 00387 /* acc1 += x[4] * y[3] */ 00388 acc1 += x0 * c0; 00389 /* acc2 += x[5] * y[3] */ 00390 acc2 += x1 * c0; 00391 /* acc3 += x[6] * y[3] */ 00392 acc3 += x2 * c0; 00393 00394 00395 } while(--k); 00396 00397 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00398 ** No loop unrolling is used. */ 00399 k = srcBLen % 0x4u; 00400 00401 while(k > 0u) 00402 { 00403 /* Read y[4] sample */ 00404 c0 = *(py++); 00405 00406 /* Read x[7] sample */ 00407 x3 = *(px++); 00408 00409 /* Perform the multiply-accumulates */ 00410 /* acc0 += x[4] * y[4] */ 00411 acc0 += x0 * c0; 00412 /* acc1 += x[5] * y[4] */ 00413 acc1 += x1 * c0; 00414 /* acc2 += x[6] * y[4] */ 00415 acc2 += x2 * c0; 00416 /* acc3 += x[7] * y[4] */ 00417 acc3 += x3 * c0; 00418 00419 /* Reuse the present samples for the next MAC */ 00420 x0 = x1; 00421 x1 = x2; 00422 x2 = x3; 00423 00424 /* Decrement the loop counter */ 00425 k--; 00426 } 00427 00428 /* Store the result in the accumulator in the destination buffer. */ 00429 *pOut = acc0; 00430 /* Destination pointer is updated according to the address modifier, inc */ 00431 pOut += inc; 00432 00433 *pOut = acc1; 00434 pOut += inc; 00435 00436 *pOut = acc2; 00437 pOut += inc; 00438 00439 *pOut = acc3; 00440 pOut += inc; 00441 00442 /* Update the inputA and inputB pointers for next MAC calculation */ 00443 px = pIn1 + (count * 4u); 00444 py = pIn2; 00445 00446 /* Increment the pointer pIn1 index, count by 1 */ 00447 count++; 00448 00449 /* Decrement the loop counter */ 00450 blkCnt--; 00451 } 00452 00453 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00454 ** No loop unrolling is used. */ 00455 blkCnt = blockSize2 % 0x4u; 00456 00457 while(blkCnt > 0u) 00458 { 00459 /* Accumulator is made zero for every iteration */ 00460 sum = 0.0f; 00461 00462 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00463 k = srcBLen >> 2u; 00464 00465 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00466 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00467 while(k > 0u) 00468 { 00469 /* Perform the multiply-accumulates */ 00470 sum += *px++ * *py++; 00471 sum += *px++ * *py++; 00472 sum += *px++ * *py++; 00473 sum += *px++ * *py++; 00474 00475 /* Decrement the loop counter */ 00476 k--; 00477 } 00478 00479 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00480 ** No loop unrolling is used. */ 00481 k = srcBLen % 0x4u; 00482 00483 while(k > 0u) 00484 { 00485 /* Perform the multiply-accumulate */ 00486 sum += *px++ * *py++; 00487 00488 /* Decrement the loop counter */ 00489 k--; 00490 } 00491 00492 /* Store the result in the accumulator in the destination buffer. */ 00493 *pOut = sum; 00494 /* Destination pointer is updated according to the address modifier, inc */ 00495 pOut += inc; 00496 00497 /* Update the inputA and inputB pointers for next MAC calculation */ 00498 px = pIn1 + count; 00499 py = pIn2; 00500 00501 /* Increment the pointer pIn1 index, count by 1 */ 00502 count++; 00503 00504 /* Decrement the loop counter */ 00505 blkCnt--; 00506 } 00507 } 00508 else 00509 { 00510 /* If the srcBLen is not a multiple of 4, 00511 * the blockSize2 loop cannot be unrolled by 4 */ 00512 blkCnt = blockSize2; 00513 00514 while(blkCnt > 0u) 00515 { 00516 /* Accumulator is made zero for every iteration */ 00517 sum = 0.0f; 00518 00519 /* Loop over srcBLen */ 00520 k = srcBLen; 00521 00522 while(k > 0u) 00523 { 00524 /* Perform the multiply-accumulate */ 00525 sum += *px++ * *py++; 00526 00527 /* Decrement the loop counter */ 00528 k--; 00529 } 00530 00531 /* Store the result in the accumulator in the destination buffer. */ 00532 *pOut = sum; 00533 /* Destination pointer is updated according to the address modifier, inc */ 00534 pOut += inc; 00535 00536 /* Update the inputA and inputB pointers for next MAC calculation */ 00537 px = pIn1 + count; 00538 py = pIn2; 00539 00540 /* Increment the pointer pIn1 index, count by 1 */ 00541 count++; 00542 00543 /* Decrement the loop counter */ 00544 blkCnt--; 00545 } 00546 } 00547 00548 /* -------------------------- 00549 * Initializations of stage3 00550 * -------------------------*/ 00551 00552 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00553 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00554 * .... 00555 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00556 * sum += x[srcALen-1] * y[0] 00557 */ 00558 00559 /* In this stage the MAC operations are decreased by 1 for every iteration. 00560 The count variable holds the number of MAC operations performed */ 00561 count = srcBLen - 1u; 00562 00563 /* Working pointer of inputA */ 00564 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00565 px = pSrc1; 00566 00567 /* Working pointer of inputB */ 00568 py = pIn2; 00569 00570 /* ------------------- 00571 * Stage3 process 00572 * ------------------*/ 00573 00574 while(blockSize3 > 0u) 00575 { 00576 /* Accumulator is made zero for every iteration */ 00577 sum = 0.0f; 00578 00579 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00580 k = count >> 2u; 00581 00582 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00583 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00584 while(k > 0u) 00585 { 00586 /* Perform the multiply-accumulates */ 00587 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00588 sum += *px++ * *py++; 00589 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00590 sum += *px++ * *py++; 00591 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00592 sum += *px++ * *py++; 00593 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00594 sum += *px++ * *py++; 00595 00596 /* Decrement the loop counter */ 00597 k--; 00598 } 00599 00600 /* If the count is not a multiple of 4, compute any remaining MACs here. 00601 ** No loop unrolling is used. */ 00602 k = count % 0x4u; 00603 00604 while(k > 0u) 00605 { 00606 /* Perform the multiply-accumulates */ 00607 sum += *px++ * *py++; 00608 00609 /* Decrement the loop counter */ 00610 k--; 00611 } 00612 00613 /* Store the result in the accumulator in the destination buffer. */ 00614 *pOut = sum; 00615 /* Destination pointer is updated according to the address modifier, inc */ 00616 pOut += inc; 00617 00618 /* Update the inputA and inputB pointers for next MAC calculation */ 00619 px = ++pSrc1; 00620 py = pIn2; 00621 00622 /* Decrement the MAC count */ 00623 count--; 00624 00625 /* Decrement the loop counter */ 00626 blockSize3--; 00627 } 00628 00629 #else 00630 00631 /* Run the below code for Cortex-M0 */ 00632 00633 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00634 float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ 00635 float32_t sum; /* Accumulator */ 00636 uint32_t i = 0u, j; /* loop counters */ 00637 uint32_t inv = 0u; /* Reverse order flag */ 00638 uint32_t tot = 0u; /* Length */ 00639 00640 /* The algorithm implementation is based on the lengths of the inputs. */ 00641 /* srcB is always made to slide across srcA. */ 00642 /* So srcBLen is always considered as shorter or equal to srcALen */ 00643 /* But CORR(x, y) is reverse of CORR(y, x) */ 00644 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00645 /* and a varaible, inv is set to 1 */ 00646 /* If lengths are not equal then zero pad has to be done to make the two 00647 * inputs of same length. But to improve the performance, we include zeroes 00648 * in the output instead of zero padding either of the the inputs*/ 00649 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00650 * starting of the output buffer */ 00651 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00652 * ending of the output buffer */ 00653 /* Once the zero padding is done the remaining of the output is calcualted 00654 * using convolution but with the shorter signal time shifted. */ 00655 00656 /* Calculate the length of the remaining sequence */ 00657 tot = ((srcALen + srcBLen) - 2u); 00658 00659 if(srcALen > srcBLen) 00660 { 00661 /* Calculating the number of zeros to be padded to the output */ 00662 j = srcALen - srcBLen; 00663 00664 /* Initialise the pointer after zero padding */ 00665 pDst += j; 00666 } 00667 00668 else if(srcALen < srcBLen) 00669 { 00670 /* Initialization to inputB pointer */ 00671 pIn1 = pSrcB; 00672 00673 /* Initialization to the end of inputA pointer */ 00674 pIn2 = pSrcA + (srcALen - 1u); 00675 00676 /* Initialisation of the pointer after zero padding */ 00677 pDst = pDst + tot; 00678 00679 /* Swapping the lengths */ 00680 j = srcALen; 00681 srcALen = srcBLen; 00682 srcBLen = j; 00683 00684 /* Setting the reverse flag */ 00685 inv = 1; 00686 00687 } 00688 00689 /* Loop to calculate convolution for output length number of times */ 00690 for (i = 0u; i <= tot; i++) 00691 { 00692 /* Initialize sum with zero to carry on MAC operations */ 00693 sum = 0.0f; 00694 00695 /* Loop to perform MAC operations according to convolution equation */ 00696 for (j = 0u; j <= i; j++) 00697 { 00698 /* Check the array limitations */ 00699 if((((i - j) < srcBLen) && (j < srcALen))) 00700 { 00701 /* z[i] += x[i-j] * y[j] */ 00702 sum += pIn1[j] * pIn2[-((int32_t) i - j)]; 00703 } 00704 } 00705 /* Store the output in the destination buffer */ 00706 if(inv == 1) 00707 *pDst-- = sum; 00708 else 00709 *pDst++ = sum; 00710 } 00711 00712 #endif /* #ifndef ARM_MATH_CM0 */ 00713 00714 } 00715