CMSIS DSP Software Library: arm_correlate

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------------   
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        15. July 2011  
00005 * $Revision:    V1.0.10  
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_correlate_f32.c   
00009 *   
00010 * Description:   Correlation of floating-point sequences.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Version 1.0.10 2011/7/15 
00015 *    Big Endian support added and Merged M0 and M3/M4 Source code.  
00016 *   
00017 * Version 1.0.3 2010/11/29  
00018 *    Re-organized the CMSIS folders and updated documentation.   
00019 *    
00020 * Version 1.0.2 2010/11/11   
00021 *    Documentation updated.    
00022 *   
00023 * Version 1.0.1 2010/10/05    
00024 *    Production release and review comments incorporated.   
00025 *   
00026 * Version 1.0.0 2010/09/20    
00027 *    Production release and review comments incorporated   
00028 *   
00029 * Version 0.0.7  2010/06/10    
00030 *    Misra-C changes done   
00031 *   
00032 * -------------------------------------------------------------------------- */
00033 
00034 #include "arm_math.h"
00035 
00095 void arm_correlate_f32(
00096   float32_t * pSrcA,
00097   uint32_t srcALen,
00098   float32_t * pSrcB,
00099   uint32_t srcBLen,
00100   float32_t * pDst)
00101 {
00102 
00103 
00104 #ifndef ARM_MATH_CM0
00105 
00106   /* Run the below code for Cortex-M4 and Cortex-M3 */
00107 
00108   float32_t *pIn1;                               /* inputA pointer */
00109   float32_t *pIn2;                               /* inputB pointer */
00110   float32_t *pOut = pDst;                        /* output pointer */
00111   float32_t *px;                                 /* Intermediate inputA pointer */
00112   float32_t *py;                                 /* Intermediate inputB pointer */
00113   float32_t *pSrc1;                              /* Intermediate pointers */
00114   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulators */
00115   float32_t x0, x1, x2, x3, c0;                  /* temporary variables for holding input and coefficient values */
00116   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters */
00117   int32_t inc = 1;                               /* Destination address modifier */
00118 
00119 
00120   /* The algorithm implementation is based on the lengths of the inputs. */
00121   /* srcB is always made to slide across srcA. */
00122   /* So srcBLen is always considered as shorter or equal to srcALen */
00123   /* But CORR(x, y) is reverse of CORR(y, x) */
00124   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00125   /* and the destination pointer modifier, inc is set to -1 */
00126   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00127   /* But to improve the performance,   
00128    * we include zeroes in the output instead of zero padding either of the the inputs*/
00129   /* If srcALen > srcBLen,   
00130    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
00131   /* If srcALen < srcBLen,   
00132    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
00133   if(srcALen >= srcBLen)
00134   {
00135     /* Initialization of inputA pointer */
00136     pIn1 = pSrcA;
00137 
00138     /* Initialization of inputB pointer */
00139     pIn2 = pSrcB;
00140 
00141     /* Number of output samples is calculated */
00142     outBlockSize = (2u * srcALen) - 1u;
00143 
00144     /* When srcALen > srcBLen, zero padding has to be done to srcB   
00145      * to make their lengths equal.   
00146      * Instead, (outBlockSize - (srcALen + srcBLen - 1))   
00147      * number of output samples are made zero */
00148     j = outBlockSize - (srcALen + (srcBLen - 1u));
00149 
00150     /* Updating the pointer position to non zero value */
00151     pOut += j;
00152 
00153     //while(j > 0u)  
00154     //{  
00155     //  /* Zero is stored in the destination buffer */  
00156     //  *pOut++ = 0.0f;  
00157 
00158     //  /* Decrement the loop counter */  
00159     //  j--;  
00160     //}  
00161 
00162   }
00163   else
00164   {
00165     /* Initialization of inputA pointer */
00166     pIn1 = pSrcB;
00167 
00168     /* Initialization of inputB pointer */
00169     pIn2 = pSrcA;
00170 
00171     /* srcBLen is always considered as shorter or equal to srcALen */
00172     j = srcBLen;
00173     srcBLen = srcALen;
00174     srcALen = j;
00175 
00176     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00177     /* Hence set the destination pointer to point to the last output sample */
00178     pOut = pDst + ((srcALen + srcBLen) - 2u);
00179 
00180     /* Destination address modifier is set to -1 */
00181     inc = -1;
00182 
00183   }
00184 
00185   /* The function is internally   
00186    * divided into three parts according to the number of multiplications that has to be   
00187    * taken place between inputA samples and inputB samples. In the first part of the   
00188    * algorithm, the multiplications increase by one for every iteration.   
00189    * In the second part of the algorithm, srcBLen number of multiplications are done.   
00190    * In the third part of the algorithm, the multiplications decrease by one   
00191    * for every iteration.*/
00192   /* The algorithm is implemented in three stages.   
00193    * The loop counters of each stage is initiated here. */
00194   blockSize1 = srcBLen - 1u;
00195   blockSize2 = srcALen - (srcBLen - 1u);
00196   blockSize3 = blockSize1;
00197 
00198   /* --------------------------   
00199    * Initializations of stage1   
00200    * -------------------------*/
00201 
00202   /* sum = x[0] * y[srcBlen - 1]   
00203    * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]   
00204    * ....   
00205    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]   
00206    */
00207 
00208   /* In this stage the MAC operations are increased by 1 for every iteration.   
00209      The count variable holds the number of MAC operations performed */
00210   count = 1u;
00211 
00212   /* Working pointer of inputA */
00213   px = pIn1;
00214 
00215   /* Working pointer of inputB */
00216   pSrc1 = pIn2 + (srcBLen - 1u);
00217   py = pSrc1;
00218 
00219   /* ------------------------   
00220    * Stage1 process   
00221    * ----------------------*/
00222 
00223   /* The first stage starts here */
00224   while(blockSize1 > 0u)
00225   {
00226     /* Accumulator is made zero for every iteration */
00227     sum = 0.0f;
00228 
00229     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00230     k = count >> 2u;
00231 
00232     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00233      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00234     while(k > 0u)
00235     {
00236       /* x[0] * y[srcBLen - 4] */
00237       sum += *px++ * *py++;
00238       /* x[1] * y[srcBLen - 3] */
00239       sum += *px++ * *py++;
00240       /* x[2] * y[srcBLen - 2] */
00241       sum += *px++ * *py++;
00242       /* x[3] * y[srcBLen - 1] */
00243       sum += *px++ * *py++;
00244 
00245       /* Decrement the loop counter */
00246       k--;
00247     }
00248 
00249     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00250      ** No loop unrolling is used. */
00251     k = count % 0x4u;
00252 
00253     while(k > 0u)
00254     {
00255       /* Perform the multiply-accumulate */
00256       /* x[0] * y[srcBLen - 1] */
00257       sum += *px++ * *py++;
00258 
00259       /* Decrement the loop counter */
00260       k--;
00261     }
00262 
00263     /* Store the result in the accumulator in the destination buffer. */
00264     *pOut = sum;
00265     /* Destination pointer is updated according to the address modifier, inc */
00266     pOut += inc;
00267 
00268     /* Update the inputA and inputB pointers for next MAC calculation */
00269     py = pSrc1 - count;
00270     px = pIn1;
00271 
00272     /* Increment the MAC count */
00273     count++;
00274 
00275     /* Decrement the loop counter */
00276     blockSize1--;
00277   }
00278 
00279   /* --------------------------   
00280    * Initializations of stage2   
00281    * ------------------------*/
00282 
00283   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]   
00284    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]   
00285    * ....   
00286    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00287    */
00288 
00289   /* Working pointer of inputA */
00290   px = pIn1;
00291 
00292   /* Working pointer of inputB */
00293   py = pIn2;
00294 
00295   /* count is index by which the pointer pIn1 to be incremented */
00296   count = 1u;
00297 
00298   /* -------------------   
00299    * Stage2 process   
00300    * ------------------*/
00301 
00302   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00303    * So, to loop unroll over blockSize2,   
00304    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00305   if(srcBLen >= 4u)
00306   {
00307     /* Loop unroll over blockSize2, by 4 */
00308     blkCnt = blockSize2 >> 2u;
00309 
00310     while(blkCnt > 0u)
00311     {
00312       /* Set all accumulators to zero */
00313       acc0 = 0.0f;
00314       acc1 = 0.0f;
00315       acc2 = 0.0f;
00316       acc3 = 0.0f;
00317 
00318       /* read x[0], x[1], x[2] samples */
00319       x0 = *(px++);
00320       x1 = *(px++);
00321       x2 = *(px++);
00322 
00323       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00324       k = srcBLen >> 2u;
00325 
00326       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00327        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00328       do
00329       {
00330         /* Read y[0] sample */
00331         c0 = *(py++);
00332 
00333         /* Read x[3] sample */
00334         x3 = *(px++);
00335 
00336         /* Perform the multiply-accumulate */
00337         /* acc0 +=  x[0] * y[0] */
00338         acc0 += x0 * c0;
00339         /* acc1 +=  x[1] * y[0] */
00340         acc1 += x1 * c0;
00341         /* acc2 +=  x[2] * y[0] */
00342         acc2 += x2 * c0;
00343         /* acc3 +=  x[3] * y[0] */
00344         acc3 += x3 * c0;
00345 
00346         /* Read y[1] sample */
00347         c0 = *(py++);
00348 
00349         /* Read x[4] sample */
00350         x0 = *(px++);
00351 
00352         /* Perform the multiply-accumulate */
00353         /* acc0 +=  x[1] * y[1] */
00354         acc0 += x1 * c0;
00355         /* acc1 +=  x[2] * y[1] */
00356         acc1 += x2 * c0;
00357         /* acc2 +=  x[3] * y[1] */
00358         acc2 += x3 * c0;
00359         /* acc3 +=  x[4] * y[1] */
00360         acc3 += x0 * c0;
00361 
00362         /* Read y[2] sample */
00363         c0 = *(py++);
00364 
00365         /* Read x[5] sample */
00366         x1 = *(px++);
00367 
00368         /* Perform the multiply-accumulates */
00369         /* acc0 +=  x[2] * y[2] */
00370         acc0 += x2 * c0;
00371         /* acc1 +=  x[3] * y[2] */
00372         acc1 += x3 * c0;
00373         /* acc2 +=  x[4] * y[2] */
00374         acc2 += x0 * c0;
00375         /* acc3 +=  x[5] * y[2] */
00376         acc3 += x1 * c0;
00377 
00378         /* Read y[3] sample */
00379         c0 = *(py++);
00380 
00381         /* Read x[6] sample */
00382         x2 = *(px++);
00383 
00384         /* Perform the multiply-accumulates */
00385         /* acc0 +=  x[3] * y[3] */
00386         acc0 += x3 * c0;
00387         /* acc1 +=  x[4] * y[3] */
00388         acc1 += x0 * c0;
00389         /* acc2 +=  x[5] * y[3] */
00390         acc2 += x1 * c0;
00391         /* acc3 +=  x[6] * y[3] */
00392         acc3 += x2 * c0;
00393 
00394 
00395       } while(--k);
00396 
00397       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00398        ** No loop unrolling is used. */
00399       k = srcBLen % 0x4u;
00400 
00401       while(k > 0u)
00402       {
00403         /* Read y[4] sample */
00404         c0 = *(py++);
00405 
00406         /* Read x[7] sample */
00407         x3 = *(px++);
00408 
00409         /* Perform the multiply-accumulates */
00410         /* acc0 +=  x[4] * y[4] */
00411         acc0 += x0 * c0;
00412         /* acc1 +=  x[5] * y[4] */
00413         acc1 += x1 * c0;
00414         /* acc2 +=  x[6] * y[4] */
00415         acc2 += x2 * c0;
00416         /* acc3 +=  x[7] * y[4] */
00417         acc3 += x3 * c0;
00418 
00419         /* Reuse the present samples for the next MAC */
00420         x0 = x1;
00421         x1 = x2;
00422         x2 = x3;
00423 
00424         /* Decrement the loop counter */
00425         k--;
00426       }
00427 
00428       /* Store the result in the accumulator in the destination buffer. */
00429       *pOut = acc0;
00430       /* Destination pointer is updated according to the address modifier, inc */
00431       pOut += inc;
00432 
00433       *pOut = acc1;
00434       pOut += inc;
00435 
00436       *pOut = acc2;
00437       pOut += inc;
00438 
00439       *pOut = acc3;
00440       pOut += inc;
00441 
00442       /* Update the inputA and inputB pointers for next MAC calculation */
00443       px = pIn1 + (count * 4u);
00444       py = pIn2;
00445 
00446       /* Increment the pointer pIn1 index, count by 1 */
00447       count++;
00448 
00449       /* Decrement the loop counter */
00450       blkCnt--;
00451     }
00452 
00453     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00454      ** No loop unrolling is used. */
00455     blkCnt = blockSize2 % 0x4u;
00456 
00457     while(blkCnt > 0u)
00458     {
00459       /* Accumulator is made zero for every iteration */
00460       sum = 0.0f;
00461 
00462       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00463       k = srcBLen >> 2u;
00464 
00465       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00466        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00467       while(k > 0u)
00468       {
00469         /* Perform the multiply-accumulates */
00470         sum += *px++ * *py++;
00471         sum += *px++ * *py++;
00472         sum += *px++ * *py++;
00473         sum += *px++ * *py++;
00474 
00475         /* Decrement the loop counter */
00476         k--;
00477       }
00478 
00479       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00480        ** No loop unrolling is used. */
00481       k = srcBLen % 0x4u;
00482 
00483       while(k > 0u)
00484       {
00485         /* Perform the multiply-accumulate */
00486         sum += *px++ * *py++;
00487 
00488         /* Decrement the loop counter */
00489         k--;
00490       }
00491 
00492       /* Store the result in the accumulator in the destination buffer. */
00493       *pOut = sum;
00494       /* Destination pointer is updated according to the address modifier, inc */
00495       pOut += inc;
00496 
00497       /* Update the inputA and inputB pointers for next MAC calculation */
00498       px = pIn1 + count;
00499       py = pIn2;
00500 
00501       /* Increment the pointer pIn1 index, count by 1 */
00502       count++;
00503 
00504       /* Decrement the loop counter */
00505       blkCnt--;
00506     }
00507   }
00508   else
00509   {
00510     /* If the srcBLen is not a multiple of 4,   
00511      * the blockSize2 loop cannot be unrolled by 4 */
00512     blkCnt = blockSize2;
00513 
00514     while(blkCnt > 0u)
00515     {
00516       /* Accumulator is made zero for every iteration */
00517       sum = 0.0f;
00518 
00519       /* Loop over srcBLen */
00520       k = srcBLen;
00521 
00522       while(k > 0u)
00523       {
00524         /* Perform the multiply-accumulate */
00525         sum += *px++ * *py++;
00526 
00527         /* Decrement the loop counter */
00528         k--;
00529       }
00530 
00531       /* Store the result in the accumulator in the destination buffer. */
00532       *pOut = sum;
00533       /* Destination pointer is updated according to the address modifier, inc */
00534       pOut += inc;
00535 
00536       /* Update the inputA and inputB pointers for next MAC calculation */
00537       px = pIn1 + count;
00538       py = pIn2;
00539 
00540       /* Increment the pointer pIn1 index, count by 1 */
00541       count++;
00542 
00543       /* Decrement the loop counter */
00544       blkCnt--;
00545     }
00546   }
00547 
00548   /* --------------------------   
00549    * Initializations of stage3   
00550    * -------------------------*/
00551 
00552   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00553    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00554    * ....   
00555    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]   
00556    * sum +=  x[srcALen-1] * y[0]   
00557    */
00558 
00559   /* In this stage the MAC operations are decreased by 1 for every iteration.   
00560      The count variable holds the number of MAC operations performed */
00561   count = srcBLen - 1u;
00562 
00563   /* Working pointer of inputA */
00564   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
00565   px = pSrc1;
00566 
00567   /* Working pointer of inputB */
00568   py = pIn2;
00569 
00570   /* -------------------   
00571    * Stage3 process   
00572    * ------------------*/
00573 
00574   while(blockSize3 > 0u)
00575   {
00576     /* Accumulator is made zero for every iteration */
00577     sum = 0.0f;
00578 
00579     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00580     k = count >> 2u;
00581 
00582     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00583      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00584     while(k > 0u)
00585     {
00586       /* Perform the multiply-accumulates */
00587       /* sum += x[srcALen - srcBLen + 4] * y[3] */
00588       sum += *px++ * *py++;
00589       /* sum += x[srcALen - srcBLen + 3] * y[2] */
00590       sum += *px++ * *py++;
00591       /* sum += x[srcALen - srcBLen + 2] * y[1] */
00592       sum += *px++ * *py++;
00593       /* sum += x[srcALen - srcBLen + 1] * y[0] */
00594       sum += *px++ * *py++;
00595 
00596       /* Decrement the loop counter */
00597       k--;
00598     }
00599 
00600     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00601      ** No loop unrolling is used. */
00602     k = count % 0x4u;
00603 
00604     while(k > 0u)
00605     {
00606       /* Perform the multiply-accumulates */
00607       sum += *px++ * *py++;
00608 
00609       /* Decrement the loop counter */
00610       k--;
00611     }
00612 
00613     /* Store the result in the accumulator in the destination buffer. */
00614     *pOut = sum;
00615     /* Destination pointer is updated according to the address modifier, inc */
00616     pOut += inc;
00617 
00618     /* Update the inputA and inputB pointers for next MAC calculation */
00619     px = ++pSrc1;
00620     py = pIn2;
00621 
00622     /* Decrement the MAC count */
00623     count--;
00624 
00625     /* Decrement the loop counter */
00626     blockSize3--;
00627   }
00628 
00629 #else
00630 
00631   /* Run the below code for Cortex-M0 */
00632 
00633   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00634   float32_t *pIn2 = pSrcB + (srcBLen - 1u);      /* inputB pointer */
00635   float32_t sum;                                 /* Accumulator */
00636   uint32_t i = 0u, j;                            /* loop counters */
00637   uint32_t inv = 0u;                             /* Reverse order flag */
00638   uint32_t tot = 0u;                             /* Length */
00639 
00640   /* The algorithm implementation is based on the lengths of the inputs. */
00641   /* srcB is always made to slide across srcA. */
00642   /* So srcBLen is always considered as shorter or equal to srcALen */
00643   /* But CORR(x, y) is reverse of CORR(y, x) */
00644   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00645   /* and a varaible, inv is set to 1 */
00646   /* If lengths are not equal then zero pad has to be done to  make the two   
00647    * inputs of same length. But to improve the performance, we include zeroes   
00648    * in the output instead of zero padding either of the the inputs*/
00649   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the   
00650    * starting of the output buffer */
00651   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the  
00652    * ending of the output buffer */
00653   /* Once the zero padding is done the remaining of the output is calcualted  
00654    * using convolution but with the shorter signal time shifted. */
00655 
00656   /* Calculate the length of the remaining sequence */
00657   tot = ((srcALen + srcBLen) - 2u);
00658 
00659   if(srcALen > srcBLen)
00660   {
00661     /* Calculating the number of zeros to be padded to the output */
00662     j = srcALen - srcBLen;
00663 
00664     /* Initialise the pointer after zero padding */
00665     pDst += j;
00666   }
00667 
00668   else if(srcALen < srcBLen)
00669   {
00670     /* Initialization to inputB pointer */
00671     pIn1 = pSrcB;
00672 
00673     /* Initialization to the end of inputA pointer */
00674     pIn2 = pSrcA + (srcALen - 1u);
00675 
00676     /* Initialisation of the pointer after zero padding */
00677     pDst = pDst + tot;
00678 
00679     /* Swapping the lengths */
00680     j = srcALen;
00681     srcALen = srcBLen;
00682     srcBLen = j;
00683 
00684     /* Setting the reverse flag */
00685     inv = 1;
00686 
00687   }
00688 
00689   /* Loop to calculate convolution for output length number of times */
00690   for (i = 0u; i <= tot; i++)
00691   {
00692     /* Initialize sum with zero to carry on MAC operations */
00693     sum = 0.0f;
00694 
00695     /* Loop to perform MAC operations according to convolution equation */
00696     for (j = 0u; j <= i; j++)
00697     {
00698       /* Check the array limitations */
00699       if((((i - j) < srcBLen) && (j < srcALen)))
00700       {
00701         /* z[i] += x[i-j] * y[j] */
00702         sum += pIn1[j] * pIn2[-((int32_t) i - j)];
00703       }
00704     }
00705     /* Store the output in the destination buffer */
00706     if(inv == 1)
00707       *pDst-- = sum;
00708     else
00709       *pDst++ = sum;
00710   }
00711 
00712 #endif /*   #ifndef ARM_MATH_CM0 */
00713 
00714 }
00715