git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c

   1 /* ----------------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_f32.c
   9 *
  10 * Description:   Correlation of floating-point sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @defgroup Corr Correlation
  42  *
  43  * Correlation is a mathematical operation that is similar to convolution.
  44  * As with convolution, correlation uses two signals to produce a third signal.
  45  * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
  46  * Correlation is commonly used to measure the similarity between two signals.
  47  * It has applications in pattern recognition, cryptanalysis, and searching.
  48  * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
  49  * Fast versions of the Q15 and Q31 functions are also provided.
  50  *
  51  * \par Algorithm
  52  * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
  53  * The convolution of the two signals is denoted by
  54  * <pre>
  55  *                   c[n] = a[n] * b[n]
  56  * </pre>
  57  * In correlation, one of the signals is flipped in time
  58  * <pre>
  59  *                   c[n] = a[n] * b[-n]
  60  * </pre>
  61  *
  62  * \par
  63  * and this is mathematically defined as
  64  * \image html CorrelateEquation.gif
  65  * \par
  66  * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
  67  * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
  68  * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
  69  *
  70  * <b>Note</b>
  71  * \par
  72  * The <code>pDst</code> should be initialized to all zeros before being used.
  73  *
  74  * <b>Fixed-Point Behavior</b>
  75  * \par
  76  * Correlation requires summing up a large number of intermediate products.
  77  * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
  78  * Refer to the function specific documentation below for further details of the particular algorithm used.
  79  */
  80
  81 /**
  82  * @addtogroup Corr
  83  * @{
  84  */
  85 /**
  86  * @brief Correlation of floating-point sequences.
  87  * @param[in]  *pSrcA points to the first input sequence.
  88  * @param[in]  srcALen length of the first input sequence.
  89  * @param[in]  *pSrcB points to the second input sequence.
  90  * @param[in]  srcBLen length of the second input sequence.
  91  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  92  * @return none.
  93  */
  94
  95 void arm_correlate_f32(
  96   float32_t * pSrcA,
  97   uint32_t srcALen,
  98   float32_t * pSrcB,
  99   uint32_t srcBLen,
 100   float32_t * pDst)
 101 {
 102
 103
 104 #ifndef ARM_MATH_CM0
 105
 106   /* Run the below code for Cortex-M4 and Cortex-M3 */
 107
 108   float32_t *pIn1;                               /* inputA pointer */
 109   float32_t *pIn2;                               /* inputB pointer */
 110   float32_t *pOut = pDst;                        /* output pointer */
 111   float32_t *px;                                 /* Intermediate inputA pointer */
 112   float32_t *py;                                 /* Intermediate inputB pointer */
 113   float32_t *pSrc1;                              /* Intermediate pointers */
 114   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulators */
 115   float32_t x0, x1, x2, x3, c0;                  /* temporary variables for holding input and coefficient values */
 116   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters */
 117   int32_t inc = 1;                               /* Destination address modifier */
 118
 119
 120   /* The algorithm implementation is based on the lengths of the inputs. */
 121   /* srcB is always made to slide across srcA. */
 122   /* So srcBLen is always considered as shorter or equal to srcALen */
 123   /* But CORR(x, y) is reverse of CORR(y, x) */
 124   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 125   /* and the destination pointer modifier, inc is set to -1 */
 126   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
 127   /* But to improve the performance,
 128    * we include zeroes in the output instead of zero padding either of the the inputs*/
 129   /* If srcALen > srcBLen,
 130    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 131   /* If srcALen < srcBLen,
 132    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 133   if(srcALen >= srcBLen)
 134   {
 135     /* Initialization of inputA pointer */
 136     pIn1 = pSrcA;
 137
 138     /* Initialization of inputB pointer */
 139     pIn2 = pSrcB;
 140
 141     /* Number of output samples is calculated */
 142     outBlockSize = (2u * srcALen) - 1u;
 143
 144     /* When srcALen > srcBLen, zero padding has to be done to srcB
 145      * to make their lengths equal.
 146      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 147      * number of output samples are made zero */
 148     j = outBlockSize - (srcALen + (srcBLen - 1u));
 149
 150     /* Updating the pointer position to non zero value */
 151     pOut += j;
 152
 153     //while(j > 0u)
 154     //{
 155     //  /* Zero is stored in the destination buffer */
 156     //  *pOut++ = 0.0f;
 157
 158     //  /* Decrement the loop counter */
 159     //  j--;
 160     //}
 161
 162   }
 163   else
 164   {
 165     /* Initialization of inputA pointer */
 166     pIn1 = pSrcB;
 167
 168     /* Initialization of inputB pointer */
 169     pIn2 = pSrcA;
 170
 171     /* srcBLen is always considered as shorter or equal to srcALen */
 172     j = srcBLen;
 173     srcBLen = srcALen;
 174     srcALen = j;
 175
 176     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 177     /* Hence set the destination pointer to point to the last output sample */
 178     pOut = pDst + ((srcALen + srcBLen) - 2u);
 179
 180     /* Destination address modifier is set to -1 */
 181     inc = -1;
 182
 183   }
 184
 185   /* The function is internally
 186    * divided into three parts according to the number of multiplications that has to be
 187    * taken place between inputA samples and inputB samples. In the first part of the
 188    * algorithm, the multiplications increase by one for every iteration.
 189    * In the second part of the algorithm, srcBLen number of multiplications are done.
 190    * In the third part of the algorithm, the multiplications decrease by one
 191    * for every iteration.*/
 192   /* The algorithm is implemented in three stages.
 193    * The loop counters of each stage is initiated here. */
 194   blockSize1 = srcBLen - 1u;
 195   blockSize2 = srcALen - (srcBLen - 1u);
 196   blockSize3 = blockSize1;
 197
 198   /* --------------------------
 199    * Initializations of stage1
 200    * -------------------------*/
 201
 202   /* sum = x[0] * y[srcBlen - 1]
 203    * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
 204    * ....
 205    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 206    */
 207
 208   /* In this stage the MAC operations are increased by 1 for every iteration.
 209      The count variable holds the number of MAC operations performed */
 210   count = 1u;
 211
 212   /* Working pointer of inputA */
 213   px = pIn1;
 214
 215   /* Working pointer of inputB */
 216   pSrc1 = pIn2 + (srcBLen - 1u);
 217   py = pSrc1;
 218
 219   /* ------------------------
 220    * Stage1 process
 221    * ----------------------*/
 222
 223   /* The first stage starts here */
 224   while(blockSize1 > 0u)
 225   {
 226     /* Accumulator is made zero for every iteration */
 227     sum = 0.0f;
 228
 229     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 230     k = count >> 2u;
 231
 232     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 233      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 234     while(k > 0u)
 235     {
 236       /* x[0] * y[srcBLen - 4] */
 237       sum += *px++ * *py++;
 238       /* x[1] * y[srcBLen - 3] */
 239       sum += *px++ * *py++;
 240       /* x[2] * y[srcBLen - 2] */
 241       sum += *px++ * *py++;
 242       /* x[3] * y[srcBLen - 1] */
 243       sum += *px++ * *py++;
 244
 245       /* Decrement the loop counter */
 246       k--;
 247     }
 248
 249     /* If the count is not a multiple of 4, compute any remaining MACs here.
 250      ** No loop unrolling is used. */
 251     k = count % 0x4u;
 252
 253     while(k > 0u)
 254     {
 255       /* Perform the multiply-accumulate */
 256       /* x[0] * y[srcBLen - 1] */
 257       sum += *px++ * *py++;
 258
 259       /* Decrement the loop counter */
 260       k--;
 261     }
 262
 263     /* Store the result in the accumulator in the destination buffer. */
 264     *pOut = sum;
 265     /* Destination pointer is updated according to the address modifier, inc */
 266     pOut += inc;
 267
 268     /* Update the inputA and inputB pointers for next MAC calculation */
 269     py = pSrc1 - count;
 270     px = pIn1;
 271
 272     /* Increment the MAC count */
 273     count++;
 274
 275     /* Decrement the loop counter */
 276     blockSize1--;
 277   }
 278
 279   /* --------------------------
 280    * Initializations of stage2
 281    * ------------------------*/
 282
 283   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 284    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 285    * ....
 286    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 287    */
 288
 289   /* Working pointer of inputA */
 290   px = pIn1;
 291
 292   /* Working pointer of inputB */
 293   py = pIn2;
 294
 295   /* count is index by which the pointer pIn1 to be incremented */
 296   count = 1u;
 297
 298   /* -------------------
 299    * Stage2 process
 300    * ------------------*/
 301
 302   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 303    * So, to loop unroll over blockSize2,
 304    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 305   if(srcBLen >= 4u)
 306   {
 307     /* Loop unroll over blockSize2, by 4 */
 308     blkCnt = blockSize2 >> 2u;
 309
 310     while(blkCnt > 0u)
 311     {
 312       /* Set all accumulators to zero */
 313       acc0 = 0.0f;
 314       acc1 = 0.0f;
 315       acc2 = 0.0f;
 316       acc3 = 0.0f;
 317
 318       /* read x[0], x[1], x[2] samples */
 319       x0 = *(px++);
 320       x1 = *(px++);
 321       x2 = *(px++);
 322
 323       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 324       k = srcBLen >> 2u;
 325
 326       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 327        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 328       do
 329       {
 330         /* Read y[0] sample */
 331         c0 = *(py++);
 332
 333         /* Read x[3] sample */
 334         x3 = *(px++);
 335
 336         /* Perform the multiply-accumulate */
 337         /* acc0 +=  x[0] * y[0] */
 338         acc0 += x0 * c0;
 339         /* acc1 +=  x[1] * y[0] */
 340         acc1 += x1 * c0;
 341         /* acc2 +=  x[2] * y[0] */
 342         acc2 += x2 * c0;
 343         /* acc3 +=  x[3] * y[0] */
 344         acc3 += x3 * c0;
 345
 346         /* Read y[1] sample */
 347         c0 = *(py++);
 348
 349         /* Read x[4] sample */
 350         x0 = *(px++);
 351
 352         /* Perform the multiply-accumulate */
 353         /* acc0 +=  x[1] * y[1] */
 354         acc0 += x1 * c0;
 355         /* acc1 +=  x[2] * y[1] */
 356         acc1 += x2 * c0;
 357         /* acc2 +=  x[3] * y[1] */
 358         acc2 += x3 * c0;
 359         /* acc3 +=  x[4] * y[1] */
 360         acc3 += x0 * c0;
 361
 362         /* Read y[2] sample */
 363         c0 = *(py++);
 364
 365         /* Read x[5] sample */
 366         x1 = *(px++);
 367
 368         /* Perform the multiply-accumulates */
 369         /* acc0 +=  x[2] * y[2] */
 370         acc0 += x2 * c0;
 371         /* acc1 +=  x[3] * y[2] */
 372         acc1 += x3 * c0;
 373         /* acc2 +=  x[4] * y[2] */
 374         acc2 += x0 * c0;
 375         /* acc3 +=  x[5] * y[2] */
 376         acc3 += x1 * c0;
 377
 378         /* Read y[3] sample */
 379         c0 = *(py++);
 380
 381         /* Read x[6] sample */
 382         x2 = *(px++);
 383
 384         /* Perform the multiply-accumulates */
 385         /* acc0 +=  x[3] * y[3] */
 386         acc0 += x3 * c0;
 387         /* acc1 +=  x[4] * y[3] */
 388         acc1 += x0 * c0;
 389         /* acc2 +=  x[5] * y[3] */
 390         acc2 += x1 * c0;
 391         /* acc3 +=  x[6] * y[3] */
 392         acc3 += x2 * c0;
 393
 394
 395       } while(--k);
 396
 397       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 398        ** No loop unrolling is used. */
 399       k = srcBLen % 0x4u;
 400
 401       while(k > 0u)
 402       {
 403         /* Read y[4] sample */
 404         c0 = *(py++);
 405
 406         /* Read x[7] sample */
 407         x3 = *(px++);
 408
 409         /* Perform the multiply-accumulates */
 410         /* acc0 +=  x[4] * y[4] */
 411         acc0 += x0 * c0;
 412         /* acc1 +=  x[5] * y[4] */
 413         acc1 += x1 * c0;
 414         /* acc2 +=  x[6] * y[4] */
 415         acc2 += x2 * c0;
 416         /* acc3 +=  x[7] * y[4] */
 417         acc3 += x3 * c0;
 418
 419         /* Reuse the present samples for the next MAC */
 420         x0 = x1;
 421         x1 = x2;
 422         x2 = x3;
 423
 424         /* Decrement the loop counter */
 425         k--;
 426       }
 427
 428       /* Store the result in the accumulator in the destination buffer. */
 429       *pOut = acc0;
 430       /* Destination pointer is updated according to the address modifier, inc */
 431       pOut += inc;
 432
 433       *pOut = acc1;
 434       pOut += inc;
 435
 436       *pOut = acc2;
 437       pOut += inc;
 438
 439       *pOut = acc3;
 440       pOut += inc;
 441
 442       /* Update the inputA and inputB pointers for next MAC calculation */
 443       px = pIn1 + (count * 4u);
 444       py = pIn2;
 445
 446       /* Increment the pointer pIn1 index, count by 1 */
 447       count++;
 448
 449       /* Decrement the loop counter */
 450       blkCnt--;
 451     }
 452
 453     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 454      ** No loop unrolling is used. */
 455     blkCnt = blockSize2 % 0x4u;
 456
 457     while(blkCnt > 0u)
 458     {
 459       /* Accumulator is made zero for every iteration */
 460       sum = 0.0f;
 461
 462       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 463       k = srcBLen >> 2u;
 464
 465       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 466        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 467       while(k > 0u)
 468       {
 469         /* Perform the multiply-accumulates */
 470         sum += *px++ * *py++;
 471         sum += *px++ * *py++;
 472         sum += *px++ * *py++;
 473         sum += *px++ * *py++;
 474
 475         /* Decrement the loop counter */
 476         k--;
 477       }
 478
 479       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 480        ** No loop unrolling is used. */
 481       k = srcBLen % 0x4u;
 482
 483       while(k > 0u)
 484       {
 485         /* Perform the multiply-accumulate */
 486         sum += *px++ * *py++;
 487
 488         /* Decrement the loop counter */
 489         k--;
 490       }
 491
 492       /* Store the result in the accumulator in the destination buffer. */
 493       *pOut = sum;
 494       /* Destination pointer is updated according to the address modifier, inc */
 495       pOut += inc;
 496
 497       /* Update the inputA and inputB pointers for next MAC calculation */
 498       px = pIn1 + count;
 499       py = pIn2;
 500
 501       /* Increment the pointer pIn1 index, count by 1 */
 502       count++;
 503
 504       /* Decrement the loop counter */
 505       blkCnt--;
 506     }
 507   }
 508   else
 509   {
 510     /* If the srcBLen is not a multiple of 4,
 511      * the blockSize2 loop cannot be unrolled by 4 */
 512     blkCnt = blockSize2;
 513
 514     while(blkCnt > 0u)
 515     {
 516       /* Accumulator is made zero for every iteration */
 517       sum = 0.0f;
 518
 519       /* Loop over srcBLen */
 520       k = srcBLen;
 521
 522       while(k > 0u)
 523       {
 524         /* Perform the multiply-accumulate */
 525         sum += *px++ * *py++;
 526
 527         /* Decrement the loop counter */
 528         k--;
 529       }
 530
 531       /* Store the result in the accumulator in the destination buffer. */
 532       *pOut = sum;
 533       /* Destination pointer is updated according to the address modifier, inc */
 534       pOut += inc;
 535
 536       /* Update the inputA and inputB pointers for next MAC calculation */
 537       px = pIn1 + count;
 538       py = pIn2;
 539
 540       /* Increment the pointer pIn1 index, count by 1 */
 541       count++;
 542
 543       /* Decrement the loop counter */
 544       blkCnt--;
 545     }
 546   }
 547
 548   /* --------------------------
 549    * Initializations of stage3
 550    * -------------------------*/
 551
 552   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 553    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 554    * ....
 555    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 556    * sum +=  x[srcALen-1] * y[0]
 557    */
 558
 559   /* In this stage the MAC operations are decreased by 1 for every iteration.
 560      The count variable holds the number of MAC operations performed */
 561   count = srcBLen - 1u;
 562
 563   /* Working pointer of inputA */
 564   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
 565   px = pSrc1;
 566
 567   /* Working pointer of inputB */
 568   py = pIn2;
 569
 570   /* -------------------
 571    * Stage3 process
 572    * ------------------*/
 573
 574   while(blockSize3 > 0u)
 575   {
 576     /* Accumulator is made zero for every iteration */
 577     sum = 0.0f;
 578
 579     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 580     k = count >> 2u;
 581
 582     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 583      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 584     while(k > 0u)
 585     {
 586       /* Perform the multiply-accumulates */
 587       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 588       sum += *px++ * *py++;
 589       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 590       sum += *px++ * *py++;
 591       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 592       sum += *px++ * *py++;
 593       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 594       sum += *px++ * *py++;
 595
 596       /* Decrement the loop counter */
 597       k--;
 598     }
 599
 600     /* If the count is not a multiple of 4, compute any remaining MACs here.
 601      ** No loop unrolling is used. */
 602     k = count % 0x4u;
 603
 604     while(k > 0u)
 605     {
 606       /* Perform the multiply-accumulates */
 607       sum += *px++ * *py++;
 608
 609       /* Decrement the loop counter */
 610       k--;
 611     }
 612
 613     /* Store the result in the accumulator in the destination buffer. */
 614     *pOut = sum;
 615     /* Destination pointer is updated according to the address modifier, inc */
 616     pOut += inc;
 617
 618     /* Update the inputA and inputB pointers for next MAC calculation */
 619     px = ++pSrc1;
 620     py = pIn2;
 621
 622     /* Decrement the MAC count */
 623     count--;
 624
 625     /* Decrement the loop counter */
 626     blockSize3--;
 627   }
 628
 629 #else
 630
 631   /* Run the below code for Cortex-M0 */
 632
 633   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
 634   float32_t *pIn2 = pSrcB + (srcBLen - 1u);      /* inputB pointer */
 635   float32_t sum;                                 /* Accumulator */
 636   uint32_t i = 0u, j;                            /* loop counters */
 637   uint32_t inv = 0u;                             /* Reverse order flag */
 638   uint32_t tot = 0u;                             /* Length */
 639
 640   /* The algorithm implementation is based on the lengths of the inputs. */
 641   /* srcB is always made to slide across srcA. */
 642   /* So srcBLen is always considered as shorter or equal to srcALen */
 643   /* But CORR(x, y) is reverse of CORR(y, x) */
 644   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 645   /* and a varaible, inv is set to 1 */
 646   /* If lengths are not equal then zero pad has to be done to  make the two
 647    * inputs of same length. But to improve the performance, we include zeroes
 648    * in the output instead of zero padding either of the the inputs*/
 649   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 650    * starting of the output buffer */
 651   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 652    * ending of the output buffer */
 653   /* Once the zero padding is done the remaining of the output is calcualted
 654    * using convolution but with the shorter signal time shifted. */
 655
 656   /* Calculate the length of the remaining sequence */
 657   tot = ((srcALen + srcBLen) - 2u);
 658
 659   if(srcALen > srcBLen)
 660   {
 661     /* Calculating the number of zeros to be padded to the output */
 662     j = srcALen - srcBLen;
 663
 664     /* Initialise the pointer after zero padding */
 665     pDst += j;
 666   }
 667
 668   else if(srcALen < srcBLen)
 669   {
 670     /* Initialization to inputB pointer */
 671     pIn1 = pSrcB;
 672
 673     /* Initialization to the end of inputA pointer */
 674     pIn2 = pSrcA + (srcALen - 1u);
 675
 676     /* Initialisation of the pointer after zero padding */
 677     pDst = pDst + tot;
 678
 679     /* Swapping the lengths */
 680     j = srcALen;
 681     srcALen = srcBLen;
 682     srcBLen = j;
 683
 684     /* Setting the reverse flag */
 685     inv = 1;
 686
 687   }
 688
 689   /* Loop to calculate convolution for output length number of times */
 690   for (i = 0u; i <= tot; i++)
 691   {
 692     /* Initialize sum with zero to carry on MAC operations */
 693     sum = 0.0f;
 694
 695     /* Loop to perform MAC operations according to convolution equation */
 696     for (j = 0u; j <= i; j++)
 697     {
 698       /* Check the array limitations */
 699       if((((i - j) < srcBLen) && (j < srcALen)))
 700       {
 701         /* z[i] += x[i-j] * y[j] */
 702         sum += pIn1[j] * pIn2[-((int32_t) i - j)];
 703       }
 704     }
 705     /* Store the output in the destination buffer */
 706     if(inv == 1)
 707       *pDst-- = sum;
 708     else
 709       *pDst++ = sum;
 710   }
 711
 712 #endif /*   #ifndef ARM_MATH_CM0 */
 713
 714 }
 715
 716 /**
 717  * @} end of Corr group
 718  */